diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index c8674aa004..f57cf76b7f 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -306,19 +306,15 @@ public: static void Finalize (); ///< Used internally by amrex -#if 0 - union Word - { - void* p; - long long ll; - long double ld; - void (*f) (); - }; - static const std::size_t align_size = sizeof(Word); -#endif - /// The alignment of allocated memory - static const std::size_t align_size = 16; + /// 256 bytes is the default alignment that cudaMalloc uses. + /// If the native allocation function were to give memory that + /// has a smaller alignment than align_size, memory from + /// the Arena will also only have this smaller alignment. + /// In practice, native allocation functions have >= 256 bytes + /// of alignment. In all cases, memory should be at least + /// aligned to std::max_align_t, usually 16. + static const std::size_t align_size = 256; /** * \brief Return the ArenaInfo object for querying diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp index 032782da8f..f7ecab4ab6 100644 --- a/Src/Base/AMReX_Arena.cpp +++ b/Src/Base/AMReX_Arena.cpp @@ -177,17 +177,27 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (arena_info.use_cpu_memory) { #endif - p = std::malloc(nbytes); + +#ifndef _WIN32 + p = std::aligned_alloc(align_size, nbytes); +#else + p = _aligned_malloc(nbytes, align_size); +#endif + if (!p) { freeUnused_protected(); - p = std::malloc(nbytes); +#ifndef _WIN32 + p = std::aligned_alloc(align_size, nbytes); +#else + p = _aligned_malloc(nbytes, align_size); +#endif } if (!p) { // out_of_memory_abort uses heap allocations, // so we print an error before in case it doesn't work. amrex::ErrorStream() << - "Out of CPU memory: got nullptr from std::malloc, aborting...\n"; - out_of_memory_abort("CPU memory", nbytes, "std::malloc returned nullptr"); + "Out of CPU memory: got nullptr from std::aligned_alloc, aborting...\n"; + out_of_memory_abort("CPU memory", nbytes, "std::aligned_alloc returned nullptr"); } #ifndef _WIN32 @@ -210,7 +220,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (ret != hipSuccess) { p = nullptr; }, auto ret = cudaHostAlloc(&p, nbytes, cudaHostAllocMapped); if (ret != cudaSuccess) { p = nullptr; }, - p = sycl::malloc_host(nbytes, Gpu::Device::syclContext()) + p = sycl::aligned_alloc_host(align_size, nbytes, Gpu::Device::syclContext()) ); if (!p) { @@ -220,7 +230,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (ret != hipSuccess) { p = nullptr; }, ret = cudaHostAlloc(&p, nbytes, cudaHostAllocMapped); if (ret != cudaSuccess) { p = nullptr; }, - p = sycl::malloc_host(nbytes, Gpu::Device::syclContext()) + p = sycl::aligned_alloc_host(align_size, nbytes, Gpu::Device::syclContext()) ); } @@ -235,7 +245,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu ": " + hipGetErrorString(ret), msg = "cudaHostAlloc returned " + std::to_string(ret) + ": " + cudaGetErrorString(ret), - msg = "sycl::malloc_host returned nullptr" + msg = "sycl::aligned_alloc_host returned nullptr" ); out_of_memory_abort("CPU pinned memory", nbytes, msg); } @@ -262,8 +272,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (ret != hipSuccess) { p = nullptr; }, auto ret = cudaMallocManaged(&p, nbytes); if (ret != cudaSuccess) { p = nullptr; }, - p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), - Gpu::Device::syclContext()) + p = sycl::aligned_alloc_shared(align_size, nbytes, Gpu::Device::syclDevice(), + Gpu::Device::syclContext()) ); if (!p) { @@ -273,8 +283,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (ret != hipSuccess) { p = nullptr; }, ret = cudaMallocManaged(&p, nbytes); if (ret != cudaSuccess) { p = nullptr; }, - p = sycl::malloc_shared(nbytes, Gpu::Device::syclDevice(), - Gpu::Device::syclContext()) + p = sycl::aligned_alloc_shared(align_size, nbytes, Gpu::Device::syclDevice(), + Gpu::Device::syclContext()) ); } @@ -285,7 +295,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu ": " + hipGetErrorString(ret), msg = "cudaMallocManaged returned " + std::to_string(ret) + ": " + cudaGetErrorString(ret), - msg = "sycl::malloc_shared returned nullptr" + msg = "sycl::aligned_alloc_shared returned nullptr" ); out_of_memory_abort("GPU managed memory", nbytes, msg); } @@ -312,8 +322,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (ret != hipSuccess) { p = nullptr; }, auto ret = cudaMalloc(&p, nbytes); if (ret != cudaSuccess) { p = nullptr; }, - p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), - Gpu::Device::syclContext()) + p = sycl::aligned_alloc_device(align_size, nbytes, Gpu::Device::syclDevice(), + Gpu::Device::syclContext()) ); if (!p) { @@ -323,8 +333,8 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu if (ret != hipSuccess) { p = nullptr; }, ret = cudaMalloc(&p, nbytes); if (ret != cudaSuccess) { p = nullptr; }, - p = sycl::malloc_device(nbytes, Gpu::Device::syclDevice(), - Gpu::Device::syclContext()) + p = sycl::aligned_alloc_device(align_size, nbytes, Gpu::Device::syclDevice(), + Gpu::Device::syclContext()) ); } @@ -335,7 +345,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu ": " + hipGetErrorString(ret), msg = "cudaMalloc returned " + std::to_string(ret) + ": " + cudaGetErrorString(ret), - msg = "sycl::malloc_device returned nullptr" + msg = "sycl::aligned_alloc_device returned nullptr" ); out_of_memory_abort("GPU device memory", nbytes, msg); } @@ -354,7 +364,12 @@ Arena::deallocate_system (void* p, std::size_t nbytes) // NOLINT(readability-mak { #endif if (p && arena_info.device_use_hostalloc) { AMREX_MUNLOCK(p, nbytes); } +#ifndef _WIN32 std::free(p); +#else + _aligned_free(p); +#endif + #ifdef AMREX_USE_GPU } else if (arena_info.device_use_hostalloc) diff --git a/Src/Base/AMReX_BArena.cpp b/Src/Base/AMReX_BArena.cpp index 054e64b854..ea51f27545 100644 --- a/Src/Base/AMReX_BArena.cpp +++ b/Src/Base/AMReX_BArena.cpp @@ -3,7 +3,12 @@ void* amrex::BArena::alloc (std::size_t sz_) { - void* pt = std::malloc(sz_); + sz_ = Arena::align(sz_); +#ifndef _WIN32 + void* pt = std::aligned_alloc(align_size, sz_); +#else + void* pt = _aligned_malloc(sz_, align_size); +#endif m_profiler.profile_alloc(pt, sz_); return pt; } @@ -12,7 +17,11 @@ void amrex::BArena::free (void* pt) { m_profiler.profile_free(pt); +#ifndef _WIN32 std::free(pt); +#else + _aligned_free(pt); +#endif } bool diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp index 0cd51afadd..3f74beda46 100644 --- a/Src/Base/AMReX_CArena.cpp +++ b/Src/Base/AMReX_CArena.cpp @@ -14,8 +14,8 @@ CArena::CArena (std::size_t hunk_size, ArenaInfo info) : m_hunk(align(hunk_size == 0 ? DefaultHunkSize : hunk_size)) { arena_info = info; - BL_ASSERT(m_hunk >= hunk_size); - BL_ASSERT(m_hunk%Arena::align_size == 0); + AMREX_ALWAYS_ASSERT(m_hunk >= hunk_size); + AMREX_ALWAYS_ASSERT(m_hunk%Arena::align_size == 0); } CArena::~CArena () @@ -88,6 +88,8 @@ CArena::alloc_protected (std::size_t nbytes) } } + N = Arena::align(N); + vp = allocate_system(N); m_used += N;