Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
408 changes: 408 additions & 0 deletions .github/workflows/getdeps_windows_arm64.yml

Large diffs are not rendered by default.

32 changes: 28 additions & 4 deletions build/fbcode_builder/getdeps/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ def _get_cmd_prefix(self) -> list[str]:
wrapper = os.path.join(self.build_dir, "succeed.bat")
with open(wrapper, "w") as f:
f.write("@echo off\n")
f.write(f'call "{vcvarsall}" amd64\n')
arch = "arm64" if self.build_opts.is_arm() else "amd64"
f.write(f'call "{vcvarsall}" {arch}\n')
f.write("set ERRORLEVEL=0\n")
f.write("exit /b 0\n")
return [wrapper, "&&"]
Expand Down Expand Up @@ -1424,7 +1425,7 @@ def _build(self, reconfigure: bool) -> None:
# jom is compatible with nmake, adds the /j argument for parallel build
make = "jom.exe"
make_j_args = ["/j%s" % self.num_jobs]
args = ["VC-WIN64A-masm", "-utf-8"]
args = ["VC-WIN64-ARM", "-utf-8"] if self.build_opts.is_arm() else ["VC-WIN64A-masm", "-utf-8"]
# fixes "if multiple CL.EXE write to the same .PDB file, please use /FS"
extra_args = ["/FS"]
elif self.build_opts.is_darwin():
Expand Down Expand Up @@ -1501,6 +1502,13 @@ def __init__(

def _build(self, reconfigure: bool) -> None:
env = self._compute_env()
if self.build_opts.is_arm():
if not self.b2_args:
self.b2_args = []
self.b2_args += [
"pch=off",
"context-impl=winfib",
]
linkage: list[str] = ["static"]
if self.build_opts.is_windows() or self.build_opts.shared_libs:
linkage.append("shared")
Expand All @@ -1520,7 +1528,10 @@ def _build(self, reconfigure: bool) -> None:
if self.build_opts.is_windows():
bootstrap = os.path.join(self.src_dir, "bootstrap.bat")
self._check_cmd([bootstrap] + bootstrap_args, cwd=self.src_dir, env=env)
args += ["address-model=64"]
if self.build_opts.is_arm():
args += ["address-model=64", "architecture=arm"]
else:
args += ["address-model=64"]
else:
bootstrap = os.path.join(self.src_dir, "bootstrap.sh")
self._check_cmd(
Expand All @@ -1529,6 +1540,19 @@ def _build(self, reconfigure: bool) -> None:
env=env,
)

b2_args = list(self.b2_args)
if self.build_opts.is_arm():
ARM_EXCLUDED_LIBS = {
"coroutine",
"graph",
"graph_parallel",
"mpi",
"python",
}
b2_args = [
arg for arg in b2_args
if not any(arg == f"--with-{lib}" for lib in ARM_EXCLUDED_LIBS)
]
b2 = os.path.join(self.src_dir, "b2")
self._check_cmd(
[
Expand All @@ -1538,7 +1562,7 @@ def _build(self, reconfigure: bool) -> None:
"--builddir=%s" % self.build_dir,
]
+ args
+ self.b2_args
+ b2_args
+ [
"link=%s" % link,
"runtime-link=shared",
Expand Down
4 changes: 3 additions & 1 deletion build/fbcode_builder/getdeps/buildopts.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,9 @@ def compute_env_for_install_dirs(
"xplat/third-party/yarn/",
yarn_exe,
)
node_exe = "node-win-x64.exe" if self.is_windows() else "node"
node_exe = "node"
if self.is_windows():
node_exe = "node-win-arm64.exe" if self.is_arm() else "node-win-x64.exe"
env["NODE_BIN"] = os.path.join(
# pyre-fixme[6]: For 1st argument expected `LiteralString` but got
# `Optional[str]`.
Expand Down
2 changes: 1 addition & 1 deletion build/fbcode_builder/getdeps/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def is_current_host_arm() -> bool:
return "ARM64" in os.uname().version
else:
machine = platform.machine().lower()
return "arm" in machine or "aarch" in machine
return "arm" in machine or "aarch" in machine or "arm64" in machine


class HostType:
Expand Down
8 changes: 6 additions & 2 deletions folly/Portability.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ constexpr bool kHasUnalignedAccess = false;
#define FOLLY_ARM 0
#endif

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#define FOLLY_AARCH64 1
#else
#define FOLLY_AARCH64 0
Expand Down Expand Up @@ -361,7 +361,7 @@ constexpr auto kHasWeakSymbols = false;
(FOLLY_SSE > major || FOLLY_SSE == major && FOLLY_SSE_MINOR >= minor)

#ifndef FOLLY_NEON
#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__CUDACC__)
#if (defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)) && !defined(__CUDACC__)
#define FOLLY_NEON 1
#else
#define FOLLY_NEON 0
Expand Down Expand Up @@ -646,6 +646,10 @@ constexpr auto kCpplibVer = 0;
#if defined(__NVCC__)
// For now, NVCC matches other compilers but does not offer coroutines.
#define FOLLY_HAS_COROUTINES 0
#elif defined(_MSC_VER) && defined(_M_ARM64)
// MSVC ARM64: coroutine codegen triggers unsupported relocations at link time,
// leading to build failures. Disable coroutines for this target.
#define FOLLY_HAS_COROUTINES 0
#elif defined(_WIN32) && defined(__clang__) && !defined(LLVM_COROUTINES) && \
!defined(LLVM_COROUTINES_CPP20)
// LLVM and MSVC coroutines are ABI incompatible, so for the MSVC implementation
Expand Down
28 changes: 15 additions & 13 deletions folly/algorithm/simd/Movemask.h
Original file line number Diff line number Diff line change
Expand Up @@ -146,13 +146,13 @@ FOLLY_ERASE auto movemaskChars16Aarch64(uint8x16_t reg) {
return std::pair{bits, std::integral_constant<std::uint32_t, 4>{}};
}

template <typename Reg>
template <typename Scalar, typename Reg>
FOLLY_ERASE uint64x1_t asUint64x1Aarch64(Reg reg) {
if constexpr (std::is_same_v<Reg, uint64x1_t>) {
if constexpr (sizeof(Scalar) == 8) {
return reg;
} else if constexpr (std::is_same_v<Reg, uint32x2_t>) {
} else if constexpr (sizeof(Scalar) == 4) {
return vreinterpret_u64_u32(reg);
} else if constexpr (std::is_same_v<Reg, uint16x4_t>) {
} else if constexpr (sizeof(Scalar) == 2) {
return vreinterpret_u64_u16(reg);
} else {
return vreinterpret_u64_u8(reg);
Expand All @@ -164,16 +164,18 @@ FOLLY_ERASE uint64x1_t asUint64x1Aarch64(Reg reg) {
template <typename Scalar>
template <typename Reg>
FOLLY_ERASE auto movemask_fn<Scalar>::operator()(Reg reg) const {
if constexpr (std::is_same_v<Reg, uint64x2_t>) {
return movemask<std::uint32_t>(vmovn_u64(reg));
} else if constexpr (std::is_same_v<Reg, uint32x4_t>) {
return movemask<std::uint16_t>(vmovn_u32(reg));
} else if constexpr (std::is_same_v<Reg, uint16x8_t>) {
return movemask<std::uint8_t>(vmovn_u16(reg));
} else if constexpr (std::is_same_v<Reg, uint8x16_t>) {
return detail::movemaskChars16Aarch64(reg);
if constexpr (sizeof(Reg) == 16) {
if constexpr (sizeof(Scalar) == 1) {
return detail::movemaskChars16Aarch64(reg);
} else if constexpr (sizeof(Scalar) == 2) {
return movemask<std::uint8_t>(vmovn_u16(reg));
} else if constexpr (sizeof(Scalar) == 4) {
return movemask<std::uint16_t>(vmovn_u32(reg));
} else {
return movemask<std::uint32_t>(vmovn_u64(reg));
}
} else {
std::uint64_t mmask = vget_lane_u64(detail::asUint64x1Aarch64(reg), 0);
std::uint64_t mmask = vget_lane_u64(detail::asUint64x1Aarch64<Scalar>(reg), 0);
return std::pair{
mmask, std::integral_constant<std::uint32_t, sizeof(Scalar) * 8>{}};
}
Expand Down
10 changes: 6 additions & 4 deletions folly/algorithm/simd/find_first_of.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,8 @@ class default_vector_finder_first_op_of {
vld1q_u8(reinterpret_cast<uint8_t const*>(input.data() + size));
auto vmask = vdupq_n_u8(Eq ? 0 : -1);
for (auto const a : alphabet_) {
auto const veq = vhaystack == vdupq_n_u8(a);
vmask = Eq ? veq | vmask : ~veq & vmask;
auto const veq = vceqq_u8(vhaystack, vdupq_n_u8(static_cast<uint8_t>(a)));
vmask = Eq ? vorrq_u8(veq, vmask) : vbicq_u8(vmask, veq);
}
#endif
if (auto const [word, bits] = movemask<CharT>(vmask); word) {
Expand Down Expand Up @@ -398,10 +398,12 @@ class shuffle_vector_finder_first_op_of {
auto const vtable = reinterpret_cast<uint8x16_t const*>(table);
auto const vhaystack =
vld1q_u8(reinterpret_cast<uint8_t const*>(input.data() + size));
auto const vhaystack_lo = vandq_u8(vhaystack, vdupq_n_u8(15));
auto vmask = vdupq_n_u8(Eq ? 0 : -1);
for (size_t i = 0; i < shuffle_.rounds; ++i) {
auto const veq = vqtbl1q_u8(vtable[i], vhaystack & 15) == vhaystack;
vmask = Eq ? veq | vmask : ~veq & vmask;
auto const vshuffle = vqtbl1q_u8(vtable[i], vhaystack_lo);
auto const veq = vceqq_u8(vshuffle, vhaystack);
vmask = Eq ? vorrq_u8(veq, vmask) : vbicq_u8(vmask, veq);
}
#endif
if (auto const [word, bits] = movemask<CharT>(vmask); word) {
Expand Down
2 changes: 1 addition & 1 deletion folly/chrono/Hardware.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ inline std::uint64_t hardware_timestamp() {
return __rdtsc();
#elif defined(__GNUC__) && (defined(__i386__) || FOLLY_X64)
return __builtin_ia32_rdtsc();
#elif FOLLY_AARCH64 && !FOLLY_MOBILE
#elif FOLLY_AARCH64 && !FOLLY_MOBILE && !defined(_MSC_VER)
uint64_t cval;
asm volatile("mrs %0, cntvct_el0" : "=r"(cval));
return cval;
Expand Down
24 changes: 21 additions & 3 deletions folly/compression/Instructions.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

#include <glog/logging.h>

#ifdef _MSC_VER
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <intrin.h>
#elif defined(_MSC_VER)
#include <immintrin.h>
#endif

Expand All @@ -45,15 +47,31 @@ struct Default {
static std::string_view name() noexcept { return "Default"; }
static bool supported(const folly::CpuId& /* cpuId */ = {}) { return true; }
static FOLLY_ALWAYS_INLINE uint64_t popcount(uint64_t value) {
#if defined(_MSC_VER) && defined(_M_ARM64)
return uint64_t(__popcnt64(value));
#else
return uint64_t(__builtin_popcountll(value));
#endif
}
static FOLLY_ALWAYS_INLINE int ctz(uint64_t value) {
DCHECK_GT(value, 0u);
return __builtin_ctzll(value);
#if defined(_MSC_VER) && defined(_M_ARM64)
unsigned long index;
_BitScanForward64(&index, value);
return int(index);
#else
return __builtin_ctzll(value);
#endif
}
static FOLLY_ALWAYS_INLINE int clz(uint64_t value) {
DCHECK_GT(value, 0u);
return __builtin_clzll(value);
#if defined(_MSC_VER) && defined(_M_ARM64)
unsigned long index;
_BitScanReverse64(&index, value);
return 63 - int(index);
#else
return __builtin_clzll(value);
#endif
}
static FOLLY_ALWAYS_INLINE uint64_t blsr(uint64_t value) {
return value & (value - 1);
Expand Down
8 changes: 7 additions & 1 deletion folly/container/detail/F14Table.h
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,13 @@ std::pair<std::size_t, std::size_t> splitHashImpl(std::size_t hash) {
// was 48 bytes of assembly (even after using the same multiplicand
// for both steps) and this one was 27 bytes, for example.
auto const kMul = 0xc4ceb9fe1a85ec53ULL;
#ifdef _WIN32
#if defined(_MSC_VER) && defined(_M_ARM64)
// MSVC ARM64: no _mul128, no __int128.
// Use 32-bit halves to compute the 128-bit product manually.
uint64_t lo = hash * kMul;
// __umulh gives the high 64 bits of an unsigned 64x64 multiply
uint64_t hi = __umulh(hash, kMul);
#elif defined(_WIN32) && !defined(FOLLY_AARCH64)
__int64 signedHi;
__int64 signedLo = _mul128(
static_cast<__int64>(hash), static_cast<__int64>(kMul), &signedHi);
Expand Down
115 changes: 113 additions & 2 deletions folly/fibers/BoostContextCompatibility.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,127 @@

#pragma once

#include <boost/context/detail/fcontext.hpp>
#include <glog/logging.h>
#include <folly/Function.h>

/**
* Wrappers for different versions of boost::context library
* API reference for different versions
* Boost 1.61:
* https://github.com/boostorg/context/blob/boost-1.61.0/include/boost/context/detail/fcontext.hpp
*
* On Windows ARM64, boost::context assembly stubs (jump_fcontext / make_fcontext)
* are not currently supported, so fall back to the native Windows Fibers API.
*/

#include <folly/Function.h>
#if defined(_WIN32) && defined(_M_ARM64)

#ifndef NOMINMAX
#define NOMINMAX
#endif
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>

namespace folly {
namespace fibers {

class FiberImpl {
public:
FiberImpl(
folly::Function<void()> func,
unsigned char* /*stackLimit*/,
size_t stackSize)
: func_(std::move(func)) {
fiber_ = CreateFiberEx(
stackSize,
stackSize,
FIBER_FLAG_FLOAT_SWITCH,
&FiberImpl::fiberFunc,
this);
CHECK(fiber_ != nullptr)
<< "CreateFiberEx failed: " << GetLastError();
}

~FiberImpl() {
if (fiber_) {
DCHECK(GetCurrentFiber() != fiber_)
<< "Destroying fiber while it is active";
DeleteFiber(std::exchange(fiber_, nullptr));
}
}

FiberImpl(const FiberImpl&) = delete;
FiberImpl& operator=(const FiberImpl&) = delete;

FiberImpl(FiberImpl&& other) noexcept
: func_(std::move(other.func_)),
fiber_(std::exchange(other.fiber_, nullptr)),
mainFiber_(std::exchange(other.mainFiber_, nullptr)),
convertedThread_(std::exchange(other.convertedThread_, false)) {}

FiberImpl& operator=(FiberImpl&& other) noexcept {
if (this != &other) {
if (fiber_) DeleteFiber(fiber_);
func_ = std::move(other.func_);
fiber_ = std::exchange(other.fiber_, nullptr);
mainFiber_ = std::exchange(other.mainFiber_, nullptr);
convertedThread_ = std::exchange(other.convertedThread_, false);
}
return *this;
}

void activate() {
mainFiber_ = GetCurrentFiber();
// On ARM64 Windows, GetCurrentFiber() returns a garbage low address
// (e.g. 0x1E00) when the thread has not been converted to a fiber yet.
// A real fiber handle is always above 64KB (Windows minimum allocation
// granularity), so use 0x10000 as the threshold.
if (mainFiber_ == nullptr ||
mainFiber_ == INVALID_HANDLE_VALUE ||
reinterpret_cast<uintptr_t>(mainFiber_) < 0x10000) {
mainFiber_ = ConvertThreadToFiber(nullptr);
CHECK(mainFiber_ != nullptr)
<< "ConvertThreadToFiber failed: " << GetLastError();
convertedThread_ = true;
}
SwitchToFiber(fiber_);
}


void deactivate() {
DCHECK(mainFiber_ != nullptr) << "deactivate() called before activate()";
SwitchToFiber(std::exchange(mainFiber_, nullptr));
if (convertedThread_) {
ConvertFiberToThread();
convertedThread_ = false;
}
}

void* getStackPointer() const { return nullptr; }

private:
static VOID CALLBACK fiberFunc(LPVOID param) {
auto* self = static_cast<FiberImpl*>(param);
self->func_();
// func_() must not return in normal folly::fibers usage.
// If it does, switch back to avoid undefined behavior — but
// don't call deactivate() as mainFiber_ may already be cleared.
CHECK(false) << "FiberImpl::func_() returned unexpectedly";
}

folly::Function<void()> func_;
LPVOID fiber_{nullptr};
LPVOID mainFiber_{nullptr};
bool convertedThread_{false};
};

} // namespace fibers
} // namespace folly
#else

#include <boost/context/detail/fcontext.hpp>

namespace folly {
namespace fibers {
Expand Down Expand Up @@ -110,3 +220,4 @@ class FiberImpl {
};
} // namespace fibers
} // namespace folly
#endif
Loading