diff --git a/tests/catch/include/hip_array_common.hh b/tests/catch/include/hip_array_common.hh
new file mode 100644
index 0000000000..fd6f094f8d
--- /dev/null
+++ b/tests/catch/include/hip_array_common.hh
@@ -0,0 +1,84 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip_test_common.hh>
+
+template <class T, size_t N, hipArray_Format Format> struct type_and_size_and_format {
+  using type = T;
+  static constexpr size_t size = N;
+  static constexpr hipArray_Format format = Format;
+};
+
+// Create a map of type to scalar type, vector size and scalar type format enum.
+// This is useful for creating simpler function that depend on the vector size.
+template <typename T> struct vector_info;
+template <>
+struct vector_info<int> : type_and_size_and_format<int, 1, HIP_AD_FORMAT_SIGNED_INT32> {};
+template <> struct vector_info<float> : type_and_size_and_format<float, 1, HIP_AD_FORMAT_FLOAT> {};
+template <>
+struct vector_info<short> : type_and_size_and_format<short, 1, HIP_AD_FORMAT_SIGNED_INT16> {};
+template <>
+struct vector_info<char> : type_and_size_and_format<char, 1, HIP_AD_FORMAT_SIGNED_INT8> {};
+template <>
+struct vector_info<unsigned int>
+    : type_and_size_and_format<unsigned int, 1, HIP_AD_FORMAT_UNSIGNED_INT32> {};
+template <>
+struct vector_info<unsigned short>
+    : type_and_size_and_format<unsigned short, 1, HIP_AD_FORMAT_UNSIGNED_INT16> {};
+template <>
+struct vector_info<unsigned char>
+    : type_and_size_and_format<unsigned char, 1, HIP_AD_FORMAT_UNSIGNED_INT8> {};
+
+template <>
+struct vector_info<int2> : type_and_size_and_format<int, 2, HIP_AD_FORMAT_SIGNED_INT32> {};
+template <> struct vector_info<float2> : type_and_size_and_format<float, 2, HIP_AD_FORMAT_FLOAT> {};
+template <>
+struct vector_info<short2> : type_and_size_and_format<short, 2, HIP_AD_FORMAT_SIGNED_INT16> {};
+template <>
+struct vector_info<char2> : type_and_size_and_format<char, 2, HIP_AD_FORMAT_SIGNED_INT8> {};
+template <>
+struct vector_info<uint2>
+    : type_and_size_and_format<unsigned int, 2, HIP_AD_FORMAT_UNSIGNED_INT32> {};
+template <>
+struct vector_info<ushort2>
+    : type_and_size_and_format<unsigned short, 2, HIP_AD_FORMAT_UNSIGNED_INT16> {};
+template <>
+struct vector_info<uchar2>
+    : type_and_size_and_format<unsigned char, 2, HIP_AD_FORMAT_UNSIGNED_INT8> {};
+
+template <>
+struct vector_info<int4> : type_and_size_and_format<int, 4, HIP_AD_FORMAT_SIGNED_INT32> {};
+template <> struct vector_info<float4> : type_and_size_and_format<float, 4, HIP_AD_FORMAT_FLOAT> {};
+template <>
+struct vector_info<short4> : type_and_size_and_format<short, 4, HIP_AD_FORMAT_SIGNED_INT16> {};
+template <>
+struct vector_info<char4> : type_and_size_and_format<char, 4, HIP_AD_FORMAT_SIGNED_INT8> {};
+template <>
+struct vector_info<uint4>
+    : type_and_size_and_format<unsigned int, 4, HIP_AD_FORMAT_UNSIGNED_INT32> {};
+template <>
+struct vector_info<ushort4>
+    : type_and_size_and_format<unsigned short, 4, HIP_AD_FORMAT_UNSIGNED_INT16> {};
+template <>
+struct vector_info<uchar4>
+    : type_and_size_and_format<unsigned char, 4, HIP_AD_FORMAT_UNSIGNED_INT8> {};
\ No newline at end of file
diff --git a/tests/catch/include/resource_guards.hh b/tests/catch/include/resource_guards.hh
index 7e6179c81a..a9c7512a3d 100644
--- a/tests/catch/include/resource_guards.hh
+++ b/tests/catch/include/resource_guards.hh
@@ -19,6 +19,7 @@ THE SOFTWARE.
 
 #pragma once
 
+#include <hip_array_common.hh>
 #include <hip_test_common.hh>
 #include <hip/hip_runtime_api.h>
 
@@ -80,10 +81,8 @@ template <typename T> class LinearAllocGuard {
     }
   }
 
-  T* ptr() { return ptr_; };
-  T* const ptr() const { return ptr_; };
-  T* host_ptr() { return host_ptr_; }
-  T* const host_ptr() const { return host_ptr(); }
+  T* ptr() const { return ptr_; };
+  T* host_ptr() const { return host_ptr_; }
 
  private:
   const LinearAllocs allocation_type_;
@@ -91,6 +90,112 @@ template <typename T> class LinearAllocGuard {
   T* host_ptr_ = nullptr;
 };
 
+template <typename T> class LinearAllocGuardMultiDim {
+ protected:
+  LinearAllocGuardMultiDim(hipExtent extent) : extent_{extent} {}
+
+  ~LinearAllocGuardMultiDim() { static_cast<void>(hipFree(pitched_ptr_.ptr)); }
+
+ public:
+  T* ptr() const { return reinterpret_cast<T*>(pitched_ptr_.ptr); };
+
+  size_t pitch() const { return pitched_ptr_.pitch; }
+
+  hipExtent extent() const { return extent_; }
+
+  hipPitchedPtr pitched_ptr() const { return pitched_ptr_; }
+
+  size_t width() const { return extent_.width; }
+
+  size_t width_logical() const { return extent_.width / sizeof(T); }
+
+  size_t height() const { return extent_.height; }
+
+ public:
+  hipPitchedPtr pitched_ptr_;
+  const hipExtent extent_;
+};
+
+template <typename T> class LinearAllocGuard2D : public LinearAllocGuardMultiDim<T> {
+ public:
+  LinearAllocGuard2D(const size_t width_logical, const size_t height)
+      : LinearAllocGuardMultiDim<T>{make_hipExtent(width_logical * sizeof(T), height, 1)} {
+    HIP_CHECK(hipMallocPitch(&this->pitched_ptr_.ptr, &this->pitched_ptr_.pitch,
+                             this->extent_.width, this->extent_.height));
+  }
+
+  LinearAllocGuard2D(const LinearAllocGuard2D&) = delete;
+  LinearAllocGuard2D(LinearAllocGuard2D&&) = delete;
+};
+
+template <typename T> class LinearAllocGuard3D : public LinearAllocGuardMultiDim<T> {
+ public:
+  LinearAllocGuard3D(const size_t width_logical, const size_t height, const size_t depth)
+      : LinearAllocGuardMultiDim<T>{make_hipExtent(width_logical * sizeof(T), height, depth)} {
+    HIP_CHECK(hipMalloc3D(&this->pitched_ptr_, this->extent_));
+  }
+
+  LinearAllocGuard3D(const hipExtent extent) : LinearAllocGuardMultiDim<T>(extent) {
+    HIP_CHECK(hipMalloc3D(&this->pitched_ptr_, this->extent_));
+  }
+
+  LinearAllocGuard3D(const LinearAllocGuard3D&) = delete;
+  LinearAllocGuard3D(LinearAllocGuard3D&&) = delete;
+
+  size_t depth() const { return this->extent_.depth; }
+};
+
+template <typename T> class ArrayAllocGuard {
+ public:
+  // extent should contain logical width
+  ArrayAllocGuard(const hipExtent extent, const unsigned int flags = 0u) : extent_{extent} {
+    hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
+    HIP_CHECK(hipMalloc3DArray(&ptr_, &desc, extent_, flags));
+  }
+
+  ~ArrayAllocGuard() { static_cast<void>(hipFreeArray(ptr_)); }
+
+  ArrayAllocGuard(const ArrayAllocGuard&) = delete;
+  ArrayAllocGuard(ArrayAllocGuard&&) = delete;
+
+  hipArray_t ptr() const { return ptr_; }
+
+  hipExtent extent() const { return extent_; }
+
+ private:
+  hipArray_t ptr_ = nullptr;
+  const hipExtent extent_;
+};
+
+template <typename T> class DrvArrayAllocGuard {
+ public:
+  // extent should contain width in bytes
+  DrvArrayAllocGuard(const hipExtent extent, const unsigned int flags = 0u) : extent_{extent} {
+    HIP_ARRAY3D_DESCRIPTOR desc{};
+    using vec_info = vector_info<T>;
+    desc.Format = vec_info::format;
+    desc.NumChannels = vec_info::size;
+    desc.Width = extent_.width / sizeof(T);
+    desc.Height = extent_.height;
+    desc.Depth = extent_.depth;
+    desc.Flags = flags;
+    HIP_CHECK(hipArray3DCreate(&ptr_, &desc));
+  }
+
+  ~DrvArrayAllocGuard() { static_cast<void>(hipArrayDestroy(ptr_)); }
+
+  DrvArrayAllocGuard(const DrvArrayAllocGuard&) = delete;
+  DrvArrayAllocGuard(DrvArrayAllocGuard&&) = delete;
+
+  hiparray ptr() const { return ptr_; }
+
+  hipExtent extent() const { return extent_; }
+
+ private:
+  hiparray ptr_ = nullptr;
+  const hipExtent extent_;
+};
+
 enum class Streams { nullstream, perThread, created };
 
 class StreamGuard {
diff --git a/tests/catch/include/utils.hh b/tests/catch/include/utils.hh
index 9edffc6f7c..bbab2322fe 100644
--- a/tests/catch/include/utils.hh
+++ b/tests/catch/include/utils.hh
@@ -54,6 +54,37 @@ void ArrayFindIfNot(T* const array, const T expected_value, const size_t num_ele
   ArrayFindIfNot(array, array + num_elements, expected_value);
 }
 
+template <typename T, typename F>
+void PitchedMemoryVerify(T* const ptr, const size_t pitch, const size_t width, const size_t height,
+                         const size_t depth, F expected_value_generator) {
+  for (size_t z = 0; z < depth; ++z) {
+    for (size_t y = 0; y < height; ++y) {
+      for (size_t x = 0; x < width; ++x) {
+        const auto slice = reinterpret_cast<uint8_t*>(ptr) + pitch * height * z;
+        const auto row = slice + pitch * y;
+        if (reinterpret_cast<T*>(row)[x] != expected_value_generator(x, y, z)) {
+          INFO("Mismatch at indices: " << x << ", " << y << ", " << z);
+          REQUIRE(reinterpret_cast<T*>(row)[x] == expected_value_generator(x, y, z));
+        }
+      }
+    }
+  }
+}
+
+template <typename T, typename F>
+void PitchedMemorySet(T* const ptr, const size_t pitch, const size_t width, const size_t height,
+                      const size_t depth, F expected_value_generator) {
+  for (size_t z = 0; z < depth; ++z) {
+    for (size_t y = 0; y < height; ++y) {
+      for (size_t x = 0; x < width; ++x) {
+        const auto slice = reinterpret_cast<uint8_t*>(ptr) + pitch * height * z;
+        const auto row = slice + pitch * y;
+        reinterpret_cast<T*>(row)[x] = expected_value_generator(x, y, z);
+      }
+    }
+  }
+}
+
 template <typename T>
 __global__ void VectorIncrement(T* const vec, const T increment_value, size_t N) {
   size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
@@ -82,6 +113,18 @@ static __global__ void Delay(uint32_t interval, const uint32_t ticks_per_ms) {
   }
 }
 
+template <typename T>
+__global__ void Iota(T* const out, size_t pitch, size_t w, size_t h, size_t d) {
+  const auto x = blockIdx.x * blockDim.x + threadIdx.x;
+  const auto y = blockIdx.y * blockDim.y + threadIdx.y;
+  const auto z = blockIdx.z * blockDim.z + threadIdx.z;
+  if (x < w && y < h && z < d) {
+    char* const slice = reinterpret_cast<char*>(out) + pitch * h * z;
+    char* const row = slice + pitch * y;
+    reinterpret_cast<T*>(row)[x] = z * w * h + y * w + x;
+  }
+}
+
 inline void LaunchDelayKernel(const std::chrono::milliseconds interval, const hipStream_t stream) {
   int ticks_per_ms = 0;
   // Clock rate is in kHz => number of clock ticks in a millisecond
diff --git a/tests/catch/unit/memory/hipArray3DCreate.cc b/tests/catch/unit/memory/hipArray3DCreate.cc
index 973868eded..4cf189611b 100644
--- a/tests/catch/unit/memory/hipArray3DCreate.cc
+++ b/tests/catch/unit/memory/hipArray3DCreate.cc
@@ -20,6 +20,7 @@ THE SOFTWARE.
 #include <limits>
 #include "DriverContext.hh"
 #include "hipArrayCommon.hh"
+#include "hip_array_common.hh"
 #include "hip_test_common.hh"
 
 namespace {
diff --git a/tests/catch/unit/memory/hipArrayCommon.hh b/tests/catch/unit/memory/hipArrayCommon.hh
index b40014b490..b0beeb3126 100644
--- a/tests/catch/unit/memory/hipArrayCommon.hh
+++ b/tests/catch/unit/memory/hipArrayCommon.hh
@@ -26,66 +26,6 @@ THE SOFTWARE.
 
 constexpr size_t BlockSize = 16;
 
-template <class T, size_t N, hipArray_Format Format> struct type_and_size_and_format {
-  using type = T;
-  static constexpr size_t size = N;
-  static constexpr hipArray_Format format = Format;
-};
-
-// Create a map of type to scalar type, vector size and scalar type format enum.
-// This is useful for creating simpler function that depend on the vector size.
-template <typename T> struct vector_info;
-template <>
-struct vector_info<int> : type_and_size_and_format<int, 1, HIP_AD_FORMAT_SIGNED_INT32> {};
-template <> struct vector_info<float> : type_and_size_and_format<float, 1, HIP_AD_FORMAT_FLOAT> {};
-template <>
-struct vector_info<short> : type_and_size_and_format<short, 1, HIP_AD_FORMAT_SIGNED_INT16> {};
-template <>
-struct vector_info<char> : type_and_size_and_format<char, 1, HIP_AD_FORMAT_SIGNED_INT8> {};
-template <>
-struct vector_info<unsigned int>
-    : type_and_size_and_format<unsigned int, 1, HIP_AD_FORMAT_UNSIGNED_INT32> {};
-template <>
-struct vector_info<unsigned short>
-    : type_and_size_and_format<unsigned short, 1, HIP_AD_FORMAT_UNSIGNED_INT16> {};
-template <>
-struct vector_info<unsigned char>
-    : type_and_size_and_format<unsigned char, 1, HIP_AD_FORMAT_UNSIGNED_INT8> {};
-
-template <>
-struct vector_info<int2> : type_and_size_and_format<int, 2, HIP_AD_FORMAT_SIGNED_INT32> {};
-template <> struct vector_info<float2> : type_and_size_and_format<float, 2, HIP_AD_FORMAT_FLOAT> {};
-template <>
-struct vector_info<short2> : type_and_size_and_format<short, 2, HIP_AD_FORMAT_SIGNED_INT16> {};
-template <>
-struct vector_info<char2> : type_and_size_and_format<char, 2, HIP_AD_FORMAT_SIGNED_INT8> {};
-template <>
-struct vector_info<uint2>
-    : type_and_size_and_format<unsigned int, 2, HIP_AD_FORMAT_UNSIGNED_INT32> {};
-template <>
-struct vector_info<ushort2>
-    : type_and_size_and_format<unsigned short, 2, HIP_AD_FORMAT_UNSIGNED_INT16> {};
-template <>
-struct vector_info<uchar2>
-    : type_and_size_and_format<unsigned char, 2, HIP_AD_FORMAT_UNSIGNED_INT8> {};
-
-template <>
-struct vector_info<int4> : type_and_size_and_format<int, 4, HIP_AD_FORMAT_SIGNED_INT32> {};
-template <> struct vector_info<float4> : type_and_size_and_format<float, 4, HIP_AD_FORMAT_FLOAT> {};
-template <>
-struct vector_info<short4> : type_and_size_and_format<short, 4, HIP_AD_FORMAT_SIGNED_INT16> {};
-template <>
-struct vector_info<char4> : type_and_size_and_format<char, 4, HIP_AD_FORMAT_SIGNED_INT8> {};
-template <>
-struct vector_info<uint4>
-    : type_and_size_and_format<unsigned int, 4, HIP_AD_FORMAT_UNSIGNED_INT32> {};
-template <>
-struct vector_info<ushort4>
-    : type_and_size_and_format<unsigned short, 4, HIP_AD_FORMAT_UNSIGNED_INT16> {};
-template <>
-struct vector_info<uchar4>
-    : type_and_size_and_format<unsigned char, 4, HIP_AD_FORMAT_UNSIGNED_INT8> {};
-
 // read from a texture using normalized coordinates
 constexpr size_t ChannelToRead = 1;
 template <typename T>
diff --git a/tests/catch/unit/memory/hipArrayCreate.cc b/tests/catch/unit/memory/hipArrayCreate.cc
index 6cc535593a..70a8636922 100644
--- a/tests/catch/unit/memory/hipArrayCreate.cc
+++ b/tests/catch/unit/memory/hipArrayCreate.cc
@@ -27,6 +27,7 @@ hipArrayCreate API test scenarios
 #include <array>
 #include <numeric>
 #include <hip_test_common.hh>
+#include <hip_array_common.hh>
 #include "hipArrayCommon.hh"
 #include "DriverContext.hh"
 
diff --git a/tests/catch/unit/memory/hipFree.cc b/tests/catch/unit/memory/hipFree.cc
index 1248deebc1..b29854271c 100644
--- a/tests/catch/unit/memory/hipFree.cc
+++ b/tests/catch/unit/memory/hipFree.cc
@@ -22,6 +22,7 @@ THE SOFTWARE.
 
 
 #include <hip_test_common.hh>
+#include <hip_array_common.hh>
 #include "hipArrayCommon.hh"
 #include "DriverContext.hh"
 
diff --git a/tests/catch/unit/memory/hipMallocArray.cc b/tests/catch/unit/memory/hipMallocArray.cc
index b6c4939b1e..530eb11077 100644
--- a/tests/catch/unit/memory/hipMallocArray.cc
+++ b/tests/catch/unit/memory/hipMallocArray.cc
@@ -26,6 +26,7 @@ hipMallocArray API test scenarios
 */
 
 #include <hip_test_common.hh>
+#include <hip_array_common.hh>
 #include <limits>
 #include <numeric>
 #include "hipArrayCommon.hh"
diff --git a/tests/catch/unit/memory/hipMemPrefetchAsync.cc b/tests/catch/unit/memory/hipMemPrefetchAsync.cc
index 17ef618b77..92dc8c9a4b 100644
--- a/tests/catch/unit/memory/hipMemPrefetchAsync.cc
+++ b/tests/catch/unit/memory/hipMemPrefetchAsync.cc
@@ -1,13 +1,15 @@
 /*
-Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
+
 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.
+
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
@@ -17,9 +19,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+#include <vector>
+
 #include <hip_test_common.hh>
-// Kernel function
-__global__ void MemPrftchAsyncKernel(int* C_d, const int* A_d, size_t N) {
+#include <hip/hip_runtime_api.h>
+#include <utils.hh>
+#include <resource_guards.hh>
+
+std::vector<int> GetDevicesWithPrefetchSupport() {
+  const auto device_count = HipTest::getDeviceCount();
+  std::vector<int> supported_devices;
+  supported_devices.reserve(device_count + 1);
+  for (int i = 0; i < device_count; ++i) {
+    if (DeviceAttributesSupport(i, hipDeviceAttributeManagedMemory,
+                                hipDeviceAttributeConcurrentManagedAccess)) {
+      supported_devices.push_back(i);
+    }
+  }
+  return supported_devices;
+}
+
+__global__ void MemPrefetchAsyncKernel(int* C_d, const int* A_d, size_t N) {
   size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
   size_t stride = blockDim.x * gridDim.x;
   for (size_t i = offset; i < N; i += stride) {
@@ -27,98 +47,120 @@ __global__ void MemPrftchAsyncKernel(int* C_d, const int* A_d, size_t N) {
   }
 }
 
+TEST_CASE("Unit_hipMemPrefetchAsync_Basic") {
+  const auto supported_devices = GetDevicesWithPrefetchSupport();
+  if (supported_devices.empty()) {
+    HipTest::HIP_SKIP_TEST("Test need at least one device with managed memory support");
+  }
 
-static int HmmAttrPrint() {
-  int managed = 0;
-  INFO("The following are the attribute values related to HMM for"
-         " device 0:\n");
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-              hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
-  INFO("hipDeviceAttributeDirectManagedMemAccessFromHost: " << managed);
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-                                 hipDeviceAttributeConcurrentManagedAccess, 0));
-  INFO("hipDeviceAttributeConcurrentManagedAccess: " << managed);
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-                                 hipDeviceAttributePageableMemoryAccess, 0));
-  INFO("hipDeviceAttributePageableMemoryAccess: " << managed);
-  HIP_CHECK(hipDeviceGetAttribute(&managed,
-              hipDeviceAttributePageableMemoryAccessUsesHostPageTables, 0));
-  INFO("hipDeviceAttributePageableMemoryAccessUsesHostPageTables:"
-         << managed);
-
-  HIP_CHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeManagedMemory,
-                                  0));
-  INFO("hipDeviceAttributeManagedMemory: " << managed);
-  return managed;
-}
+  LinearAllocGuard<int> alloc1(LinearAllocs::hipMallocManaged, kPageSize);
+  const auto count = kPageSize / sizeof(*alloc1.ptr());
+  constexpr auto fill_value = 42;
+  std::fill_n(alloc1.ptr(), count, fill_value);
 
-/*
-  Test Description: This test prefetches the memory to each of the available
-  devices and launch kernel followed by result verification
-  At the end the memory is prefetched to Host and kernel is launched followed
-  by result verification.
-*/
+  for (const auto device : supported_devices) {
+    HIP_CHECK(hipSetDevice(device));
+    LinearAllocGuard<int> alloc2(LinearAllocs::hipMallocManaged, kPageSize);
+    StreamGuard sg(Streams::created);
+    HIP_CHECK(hipMemPrefetchAsync(alloc1.ptr(), kPageSize, device, sg.stream()));
+    MemPrefetchAsyncKernel<<<count / 1024 + 1, 1024, 0, sg.stream()>>>(alloc2.ptr(), alloc1.ptr(),
+                                                                       count);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipStreamSynchronize(sg.stream()));
+    ArrayFindIfNot(alloc1.ptr(), fill_value, count);
+    ArrayFindIfNot(alloc2.ptr(), fill_value * fill_value, count);
+  }
 
-TEST_CASE("Unit_hipMemPrefetchAsync") {
-  int MangdMem = HmmAttrPrint();
-  if (MangdMem == 1) {
-    bool IfTestPassed = true;
-    int A_CONST = 123, MEM_SIZE = (8192 * sizeof(int));
-    int *devPtr1 = NULL, *devPtr2 = NULL, NumDevs = 0, flag = 0;
-    hipStream_t strm;
-    HIP_CHECK(hipMallocManaged(&devPtr1, MEM_SIZE));
-    HIP_CHECK(hipMallocManaged(&devPtr2, MEM_SIZE));
-    HIP_CHECK(hipGetDeviceCount(&NumDevs));
-    // Initializing the memory
-    for (uint32_t k = 0; k < (MEM_SIZE/sizeof(int)); ++k) {
-      devPtr1[k] = A_CONST;
-      devPtr2[k] = 0;
-    }
+  HIP_CHECK(hipMemPrefetchAsync(alloc1.ptr(), kPageSize, hipCpuDeviceId));
+  HIP_CHECK(hipStreamSynchronize(nullptr));
+  ArrayFindIfNot(alloc1.ptr(), fill_value, count);
+}
 
+TEST_CASE("Unit_hipMemPrefetchAsync_Sync_Behavior") {
+  const auto supported_devices = GetDevicesWithPrefetchSupport();
+  if (supported_devices.empty()) {
+    HipTest::HIP_SKIP_TEST("Test need at least one device with managed memory support");
+  }
+  const auto device = supported_devices.front();
+  const auto stream_type = GENERATE(Streams::nullstream, Streams::perThread, Streams::created);
 
-    for (int i = 0; i < NumDevs;  ++i) {
-      HIP_CHECK(hipSetDevice(i));
-      HIP_CHECK(hipStreamCreate(&strm));
-      HIP_CHECK(hipMemPrefetchAsync(devPtr1, MEM_SIZE, i, strm));
-      HIP_CHECK(hipStreamSynchronize(strm));
-      MemPrftchAsyncKernel<<<32, (MEM_SIZE/sizeof(int)/32)>>>(devPtr2, devPtr1,
-                                                       MEM_SIZE/sizeof(int));
-      for (uint32_t m = 0; m < (MEM_SIZE/sizeof(int)); ++m) {
-        if (devPtr1[m] != (A_CONST * A_CONST)) {
-          flag = 1;
-        }
-      }
-      HIP_CHECK(hipStreamDestroy(strm));
-      if (!flag) {
-        INFO("Test failed for device: " << i);
-        IfTestPassed = false;
-        flag = 0;
-      }
-    }
-    // The memory will be prefetched from last gpu in the system to the host
-    // memory and kernel is launched followed by result verification.
-    HIP_CHECK(hipStreamCreate(&strm));
-    HIP_CHECK(hipMemPrefetchAsync(devPtr1, MEM_SIZE, hipCpuDeviceId, strm));
-    HIP_CHECK(hipStreamSynchronize(strm));
-    MemPrftchAsyncKernel<<<32, (MEM_SIZE/sizeof(int)/32)>>>(devPtr2, devPtr1,
-                                                     MEM_SIZE/sizeof(int));
-    for (uint32_t m = 0; m < (MEM_SIZE/sizeof(int)); ++m) {
-      if (devPtr1[m] != (A_CONST * A_CONST)) {
-        flag = 1;
-      }
-    }
-    HIP_CHECK(hipStreamDestroy(strm));
-    if (!flag) {
-      INFO("Failed to prefetch the memory to System space.\n");
-      IfTestPassed = false;
-      flag = 0;
-    }
+  StreamGuard sg(stream_type);
+  LinearAllocGuard<void> alloc(LinearAllocs::hipMallocManaged, kPageSize);
+  LaunchDelayKernel(std::chrono::milliseconds{100}, sg.stream());
+  HIP_CHECK(hipMemPrefetchAsync(alloc.ptr(), kPageSize, device, sg.stream()));
+  HIP_CHECK_ERROR(hipStreamQuery(sg.stream()), hipErrorNotReady);
+  HIP_CHECK(hipStreamSynchronize(sg.stream()));
+}
 
-    HIP_CHECK(hipFree(devPtr1));
-    HIP_CHECK(hipFree(devPtr2));
-    REQUIRE(IfTestPassed);
-  } else {
-    SUCCEED("GPU 0 doesn't support hipDeviceAttributeManagedMemory "
-           "attribute. Hence skipping the testing with Pass result.\n");
+TEST_CASE("Unit_hipMemPrefetchAsync_Rounding_Behavior") {
+  auto supported_devices = GetDevicesWithPrefetchSupport();
+  if (supported_devices.empty()) {
+    HipTest::HIP_SKIP_TEST("Test need at least one device with managed memory support");
   }
+  const auto device = supported_devices.front();
+  LinearAllocGuard<uint8_t> alloc(LinearAllocs::hipMallocManaged, 3 * kPageSize);
+  REQUIRE_FALSE(reinterpret_cast<intptr_t>(alloc.ptr()) % kPageSize);
+  const auto [offset, width] =
+      GENERATE_COPY(std::make_pair(kPageSize / 4, kPageSize / 2),   // Withing page
+                    std::make_pair(kPageSize / 2, kPageSize),       // Across page border
+                    std::make_pair(kPageSize / 2, kPageSize * 2));  // Across two page borders
+  HIP_CHECK(hipMemPrefetchAsync(alloc.ptr() + offset, width, device));
+  HIP_CHECK(hipStreamSynchronize(nullptr));
+  constexpr auto RoundDown = [](const intptr_t a, const intptr_t n) { return a - a % n; };
+  constexpr auto RoundUp = [RoundDown](const intptr_t a, const intptr_t n) {
+    return RoundDown(a + n - 1, n);
+  };
+  const auto base = alloc.ptr();
+  const auto rounded_up = RoundUp(offset + width, kPageSize);
+  unsigned int attribute = 0;
+  HIP_CHECK(hipMemRangeGetAttribute(&attribute, sizeof(attribute),
+                                    hipMemRangeAttributeLastPrefetchLocation,
+                                    reinterpret_cast<void*>(base), rounded_up));
+  REQUIRE(device == attribute);
+  HIP_CHECK(hipMemRangeGetAttribute(&attribute, sizeof(attribute),
+                                    hipMemRangeAttributeLastPrefetchLocation, alloc.ptr(),
+                                    3 * kPageSize));
+  REQUIRE((rounded_up == 3 * kPageSize ? device : hipInvalidDeviceId) == attribute);
 }
+
+TEST_CASE("Unit_hipMemPrefetchAsync_Negative_Parameters") {
+  auto supported_devices = GetDevicesWithPrefetchSupport();
+  if (supported_devices.empty()) {
+    HipTest::HIP_SKIP_TEST("Test need at least one device with managed memory support");
+  }
+  supported_devices.push_back(hipCpuDeviceId);
+  const auto device = GENERATE_COPY(from_range(supported_devices));
+
+  LinearAllocGuard<void> alloc(LinearAllocs::hipMallocManaged, kPageSize);
+  SECTION("dev_ptr == nullptr") {
+    HIP_CHECK_ERROR(hipMemPrefetchAsync(nullptr, kPageSize, device), hipErrorInvalidValue);
+  }
+
+#if HT_NVIDIA
+  SECTION("dev_ptr points to non-managed memory") {
+    LinearAllocGuard<void> alloc(LinearAllocs::hipMalloc, kPageSize);
+    HIP_CHECK_ERROR(hipMemPrefetchAsync(alloc.ptr(), kPageSize, device), hipErrorInvalidValue);
+  }
+#endif
+
+  SECTION("count == 0") {
+    HIP_CHECK_ERROR(hipMemPrefetchAsync(alloc.ptr(), 0, device), hipErrorInvalidValue);
+  }
+
+  SECTION("count larger than allocation size") {
+    HIP_CHECK_ERROR(hipMemPrefetchAsync(alloc.ptr(), kPageSize + 1, device), hipErrorInvalidValue);
+  }
+
+  SECTION("Invalid device") {
+    HIP_CHECK_ERROR(hipMemPrefetchAsync(alloc.ptr(), kPageSize, hipInvalidDeviceId),
+                    hipErrorInvalidDevice);
+  }
+
+  SECTION("Invalid stream") {
+    hipStream_t stream;
+    HIP_CHECK(hipStreamCreate(&stream));
+    HIP_CHECK(hipStreamDestroy(stream));
+    HIP_CHECK_ERROR(hipMemPrefetchAsync(alloc.ptr(), kPageSize, device, stream),
+                    hipErrorContextIsDestroyed);
+  }
+}
\ No newline at end of file