diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
index c8674aa004..f57cf76b7f 100644
--- a/Src/Base/AMReX_Arena.H
+++ b/Src/Base/AMReX_Arena.H
@@ -306,19 +306,15 @@ public:
 
     static void Finalize (); ///< Used internally by amrex
 
-#if 0
-    union Word
-    {
-        void*  p;
-        long long ll;
-        long double ld;
-        void (*f) ();
-    };
-    static const std::size_t align_size = sizeof(Word);
-#endif
-
     /// The alignment of allocated memory
-    static const std::size_t align_size = 16;
+    /// 256 bytes is the default alignment that cudaMalloc uses.
+    /// If the native allocation function were to give memory that
+    /// has a smaller alignment than align_size, memory from
+    /// the Arena will also only have this smaller alignment.
+    /// In practice, native allocation functions have >= 256 bytes
+    /// of alignment. In all cases, memory should be at least
+    /// aligned to std::max_align_t, usually 16.
+    static const std::size_t align_size = 256;
 
     /**
      *  \brief Return the ArenaInfo object for querying
diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp
index 032782da8f..f7ecab4ab6 100644
--- a/Src/Base/AMReX_Arena.cpp
+++ b/Src/Base/AMReX_Arena.cpp
@@ -177,17 +177,27 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
     if (arena_info.use_cpu_memory)
     {
 #endif
-        p = std::malloc(nbytes);
+
+#ifndef _WIN32
+        p = std::aligned_alloc(align_size, nbytes);
+#else
+        p = _aligned_malloc(nbytes, align_size);
+#endif
+
         if (!p) {
             freeUnused_protected();
-            p = std::malloc(nbytes);
+#ifndef _WIN32
+            p = std::aligned_alloc(align_size, nbytes);
+#else
+            p = _aligned_malloc(nbytes, align_size);
+#endif
         }
         if (!p) {
             // out_of_memory_abort uses heap allocations,
             // so we print an error before in case it doesn't work.
             amrex::ErrorStream() <<
-                "Out of CPU memory: got nullptr from std::malloc, aborting...\n";
-            out_of_memory_abort("CPU memory", nbytes, "std::malloc returned nullptr");
+                "Out of CPU memory: got nullptr from std::aligned_alloc, aborting...\n";
+            out_of_memory_abort("CPU memory", nbytes, "std::aligned_alloc returned nullptr");
         }
 
 #ifndef _WIN32
@@ -210,7 +220,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
             if (ret != hipSuccess) { p = nullptr; },
             auto ret = cudaHostAlloc(&p, nbytes, cudaHostAllocMapped);
             if (ret != cudaSuccess) { p = nullptr; },
-            p = sycl::malloc_host(nbytes, Gpu::Device::syclContext())
+            p = sycl::aligned_alloc_host(align_size, nbytes, Gpu::Device::syclContext())
         );
 
         if (!p) {
@@ -220,7 +230,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                 if (ret != hipSuccess) { p = nullptr; },
                 ret = cudaHostAlloc(&p, nbytes, cudaHostAllocMapped);
                 if (ret != cudaSuccess) { p = nullptr; },
-                p = sycl::malloc_host(nbytes, Gpu::Device::syclContext())
+                p = sycl::aligned_alloc_host(align_size, nbytes, Gpu::Device::syclContext())
             );
         }
 
@@ -235,7 +245,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                       ": " + hipGetErrorString(ret),
                 msg = "cudaHostAlloc returned " + std::to_string(ret) +
                       ": " + cudaGetErrorString(ret),
-                msg = "sycl::malloc_host returned nullptr"
+                msg = "sycl::aligned_alloc_host returned nullptr"
             );
             out_of_memory_abort("CPU pinned memory", nbytes, msg);
         }
@@ -262,8 +272,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                 if (ret != hipSuccess) { p = nullptr; },
                 auto ret = cudaMallocManaged(&p, nbytes);
                 if (ret != cudaSuccess) { p = nullptr; },
-                p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(),
-                                        Gpu::Device::syclContext())
+                p = sycl::aligned_alloc_shared(align_size, nbytes, Gpu::Device::syclDevice(),
+                                               Gpu::Device::syclContext())
             );
 
             if (!p) {
@@ -273,8 +283,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                     if (ret != hipSuccess) { p = nullptr; },
                     ret = cudaMallocManaged(&p, nbytes);
                     if (ret != cudaSuccess) { p = nullptr; },
-                    p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(),
-                                            Gpu::Device::syclContext())
+                    p = sycl::aligned_alloc_shared(align_size, nbytes, Gpu::Device::syclDevice(),
+                                                   Gpu::Device::syclContext())
                 );
             }
 
@@ -285,7 +295,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                           ": " + hipGetErrorString(ret),
                     msg = "cudaMallocManaged returned " + std::to_string(ret) +
                           ": " + cudaGetErrorString(ret),
-                    msg = "sycl::malloc_shared returned nullptr"
+                    msg = "sycl::aligned_alloc_shared returned nullptr"
                 );
                 out_of_memory_abort("GPU managed memory", nbytes, msg);
             }
@@ -312,8 +322,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                 if (ret != hipSuccess) { p = nullptr; },
                 auto ret = cudaMalloc(&p, nbytes);
                 if (ret != cudaSuccess) { p = nullptr; },
-                p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(),
-                                        Gpu::Device::syclContext())
+                p = sycl::aligned_alloc_device(align_size, nbytes, Gpu::Device::syclDevice(),
+                                               Gpu::Device::syclContext())
             );
 
             if (!p) {
@@ -323,8 +333,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                     if (ret != hipSuccess) { p = nullptr; },
                     ret = cudaMalloc(&p, nbytes);
                     if (ret != cudaSuccess) { p = nullptr; },
-                    p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(),
-                                            Gpu::Device::syclContext())
+                    p = sycl::aligned_alloc_device(align_size, nbytes, Gpu::Device::syclDevice(),
+                                                   Gpu::Device::syclContext())
                 );
             }
 
@@ -335,7 +345,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
                           ": " + hipGetErrorString(ret),
                     msg = "cudaMalloc returned " + std::to_string(ret) +
                           ": " + cudaGetErrorString(ret),
-                    msg = "sycl::malloc_device returned nullptr"
+                    msg = "sycl::aligned_alloc_device returned nullptr"
                 );
                 out_of_memory_abort("GPU device memory", nbytes, msg);
             }
@@ -354,7 +364,12 @@ Arena::deallocate_system (void* p, std::size_t nbytes) // NOLINT(readability-mak
     {
 #endif
         if (p && arena_info.device_use_hostalloc) { AMREX_MUNLOCK(p, nbytes); }
+#ifndef _WIN32
         std::free(p);
+#else
+        _aligned_free(p);
+#endif
+
 #ifdef AMREX_USE_GPU
     }
     else if (arena_info.device_use_hostalloc)
diff --git a/Src/Base/AMReX_BArena.cpp b/Src/Base/AMReX_BArena.cpp
index 054e64b854..ea51f27545 100644
--- a/Src/Base/AMReX_BArena.cpp
+++ b/Src/Base/AMReX_BArena.cpp
@@ -3,7 +3,12 @@
 void*
 amrex::BArena::alloc (std::size_t sz_)
 {
-    void* pt = std::malloc(sz_);
+    sz_ = Arena::align(sz_);
+#ifndef _WIN32
+    void* pt = std::aligned_alloc(align_size, sz_);
+#else
+    void* pt = _aligned_malloc(sz_, align_size);
+#endif
     m_profiler.profile_alloc(pt, sz_);
     return pt;
 }
@@ -12,7 +17,11 @@ void
 amrex::BArena::free (void* pt)
 {
     m_profiler.profile_free(pt);
+#ifndef _WIN32
     std::free(pt);
+#else
+    _aligned_free(pt);
+#endif
 }
 
 bool
diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp
index 0cd51afadd..3f74beda46 100644
--- a/Src/Base/AMReX_CArena.cpp
+++ b/Src/Base/AMReX_CArena.cpp
@@ -14,8 +14,8 @@ CArena::CArena (std::size_t hunk_size, ArenaInfo info)
     : m_hunk(align(hunk_size == 0 ? DefaultHunkSize : hunk_size))
 {
     arena_info = info;
-    BL_ASSERT(m_hunk >= hunk_size);
-    BL_ASSERT(m_hunk%Arena::align_size == 0);
+    AMREX_ALWAYS_ASSERT(m_hunk >= hunk_size);
+    AMREX_ALWAYS_ASSERT(m_hunk%Arena::align_size == 0);
 }
 
 CArena::~CArena ()
@@ -88,6 +88,8 @@ CArena::alloc_protected (std::size_t nbytes)
             }
         }
 
+        N = Arena::align(N);
+
         vp = allocate_system(N);
 
         m_used += N;