diff --git a/cmake/XercesSSE2.cmake b/cmake/XercesSSE2.cmake index 0f6656085..813e78de4 100644 --- a/cmake/XercesSSE2.cmake +++ b/cmake/XercesSSE2.cmake @@ -101,3 +101,52 @@ int main() { set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_SAVE}") endif() endif() + +# ARM NEON support + +option(neon "ARM NEON support" ON) +if(neon) + set(CMAKE_CXX_FLAGS_SAVE_NEON "${CMAKE_CXX_FLAGS}") + if((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND + CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)($|v)") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") + check_cxx_source_compiles(" +#include +int main() { + int32x4_t v = vdupq_n_s32(0); + (void)v; + return 0; +}" + CXX_NEEDS_mfpu_neon) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_SAVE_NEON}") + if(CXX_NEEDS_mfpu_neon) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon") + endif() + endif() + + check_cxx_source_compiles(" +#include + +int main() { + return 0; +}" + XERCES_HAVE_ARM_NEON_H) + + check_cxx_source_compiles(" +#include + +int main() { + alignas(16) int data1[4] = {1,2,3,4}; + alignas(16) int data2[4] = {5,6,7,8}; + int32x4_t a = vld1q_s32(data1); + int32x4_t b = vld1q_s32(data2); + int32x4_t c = vorrq_s32(a, b); + vst1q_s32(data1, c); + return 0; +}" + XERCES_HAVE_NEON_INTRINSIC) + + if(NOT XERCES_HAVE_NEON_INTRINSIC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_SAVE_NEON}") + endif() +endif() diff --git a/config.h.cmake.in b/config.h.cmake.in index b86c12f1f..d5cc7d82f 100644 --- a/config.h.cmake.in +++ b/config.h.cmake.in @@ -298,6 +298,12 @@ /* Define to have SSE2 instruction used at runtime */ #cmakedefine XERCES_HAVE_SSE2_INTRINSIC 1 +/* Define to 1 if you have arm_neon.h */ +#cmakedefine XERCES_HAVE_ARM_NEON_H 1 + +/* Define to have NEON instruction used at runtime */ +#cmakedefine XERCES_HAVE_NEON_INTRINSIC 1 + /* Define to 1 if we have sys/types.h */ #cmakedefine XERCES_HAVE_SYS_TYPES_H 1 diff --git a/configure.ac b/configure.ac index 2e920856f..423b99cad 100644 --- a/configure.ac +++ b/configure.ac @@ -350,6 +350,13 @@ AC_ARG_ENABLE(sse2, [have_sse2=${enableval}], [have_sse2=yes]) +# Allow the user to disable ARM NEON support. +# +AC_ARG_ENABLE(neon, + AC_HELP_STRING([--disable-neon],[disable ARM NEON optimizations]), + [have_neon=${enableval}], + [have_neon=yes]) + ###################################################### # Define some namespace-protected macros for use in the # publicly visible Xerces_autoconf_config.h file. @@ -499,6 +506,65 @@ if test "$have_sse2" = "yes"; then fi fi +if test "$have_neon" = "yes"; then + no_neon_CXXFLAGS="$CXXFLAGS" + case $host_cpu in + arm*) + save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS -mfpu=neon" + AC_MSG_CHECKING([whether we need to add -mfpu=neon]) + AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[#include ]], + [[int32x4_t v = vdupq_n_s32(0); (void)v;]])], + [mfpu_neon_ok=yes], + [mfpu_neon_ok=no] + ) + AC_MSG_RESULT($mfpu_neon_ok) + if test x"$mfpu_neon_ok" = xno; then + CXXFLAGS="$save_CXXFLAGS" + fi + ;; + esac + + AC_MSG_CHECKING([for arm_neon.h usability]) + AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[#include ]], + [[]])], + [ + AC_MSG_RESULT([yes]) + AC_DEFINE_UNQUOTED([XERCES_HAVE_ARM_NEON_H], 1, [Define to 1 if you have arm_neon.h]) + ], + [ + AC_MSG_RESULT([no]) + ] + ) + + AC_MSG_CHECKING([whether the compiler has the NEON intrinsic]) + AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[#include ]], + [[ + int data1[[4]] __attribute__((aligned(16))) = {1,2,3,4}; + int data2[[4]] __attribute__((aligned(16))) = {5,6,7,8}; + int32x4_t a = vld1q_s32(data1); + int32x4_t b = vld1q_s32(data2); + int32x4_t c = vorrq_s32(a, b); + vst1q_s32(data1, c); + ]])], + [ + AC_MSG_RESULT([yes]) + neon_usable=yes + AC_DEFINE_UNQUOTED([XERCES_HAVE_NEON_INTRINSIC], 1, [Define to have NEON instruction used at runtime]) + ], + [ + AC_MSG_RESULT([no]) + neon_usable=no + ] + ) + + # Restore original CXXFLAGS if NEON is not usable. + # + if test "$neon_usable" = "no"; then + CXXFLAGS="$no_neon_CXXFLAGS" + fi +fi + AC_OUTPUT AC_MSG_NOTICE diff --git a/src/xercesc/util/PlatformUtils.cpp b/src/xercesc/util/PlatformUtils.cpp index d1f8076d1..e74cfc43d 100644 --- a/src/xercesc/util/PlatformUtils.cpp +++ b/src/xercesc/util/PlatformUtils.cpp @@ -223,7 +223,7 @@ void XMLPlatformUtils::Initialize(const char* const locale endianTest.ch = 1; fgXMLChBigEndian = (endianTest.ar[sizeof(XMLCh)-1] == 1); - // Determine if we can use SSE2 functions + // Determine if we can use SSE2 / NEON functions #if defined(XERCES_HAVE_CPUID_INTRINSIC) int CPUInfo[4]={0}; __cpuid(CPUInfo, 1); @@ -237,7 +237,7 @@ void XMLPlatformUtils::Initialize(const char* const locale fgSSE2ok = false; else fgSSE2ok = true; -#elif defined(XERCES_HAVE_SSE2_INTRINSIC) +#elif defined(XERCES_HAVE_SSE2_INTRINSIC) || defined(XERCES_HAVE_NEON_INTRINSIC) // if we cannot find out at runtime, assume the define has it right fgSSE2ok = true; #else diff --git a/src/xercesc/util/Xerces_autoconf_config.hpp.cmake.in b/src/xercesc/util/Xerces_autoconf_config.hpp.cmake.in index 8870e03e2..4618428ef 100644 --- a/src/xercesc/util/Xerces_autoconf_config.hpp.cmake.in +++ b/src/xercesc/util/Xerces_autoconf_config.hpp.cmake.in @@ -61,6 +61,8 @@ #cmakedefine XERCES_HAVE_CPUID_INTRINSIC 1 #cmakedefine XERCES_HAVE_SSE2_INTRINSIC 1 #cmakedefine XERCES_HAVE_GETCPUID 1 +#cmakedefine XERCES_HAVE_ARM_NEON_H 1 +#cmakedefine XERCES_HAVE_NEON_INTRINSIC 1 #cmakedefine XERCES_DLL_EXPORT 1 #cmakedefine XERCES_STATIC_LIBRARY 1 diff --git a/src/xercesc/util/Xerces_autoconf_config.hpp.in b/src/xercesc/util/Xerces_autoconf_config.hpp.in index 0edfdbe67..b543e6ade 100644 --- a/src/xercesc/util/Xerces_autoconf_config.hpp.in +++ b/src/xercesc/util/Xerces_autoconf_config.hpp.in @@ -61,6 +61,8 @@ #undef XERCES_HAVE_CPUID_INTRINSIC #undef XERCES_HAVE_SSE2_INTRINSIC #undef XERCES_HAVE_GETCPUID +#undef XERCES_HAVE_ARM_NEON_H +#undef XERCES_HAVE_NEON_INTRINSIC #undef XERCES_PLATFORM_EXPORT #undef XERCES_PLATFORM_IMPORT diff --git a/src/xercesc/validators/common/CMStateSet.hpp b/src/xercesc/validators/common/CMStateSet.hpp index 4cfc85faa..8a4c712aa 100644 --- a/src/xercesc/validators/common/CMStateSet.hpp +++ b/src/xercesc/validators/common/CMStateSet.hpp @@ -41,15 +41,27 @@ #if XERCES_HAVE_EMMINTRIN_H # include #endif +#if XERCES_HAVE_ARM_NEON_H +# include +#endif + +#if defined(XERCES_HAVE_SSE2_INTRINSIC) || defined(XERCES_HAVE_NEON_INTRINSIC) +# define XERCES_USE_SIMD 1 +# if defined(_MSC_VER) && !defined(__MINGW32__) +# include // _aligned_malloc +# else +# include // posix_memalign +# endif +#endif namespace XERCES_CPP_NAMESPACE { class CMStateSetEnumerator; -// This value must be 4 in order to use the SSE2 instruction set +// This value must be 4 in order to use the SIMD (SSE2/NEON) instruction set #define CMSTATE_CACHED_INT32_SIZE 4 -// This value must be a multiple of 128 in order to use the SSE2 instruction set +// This value must be a multiple of 128 in order to use the SIMD (SSE2/NEON) instruction set #define CMSTATE_BITFIELD_CHUNK 1024 #define CMSTATE_BITFIELD_INT32_SIZE (1024 / 32) @@ -171,13 +183,20 @@ public : { if(fDynamicBuffer==0) { -#ifdef XERCES_HAVE_SSE2_INTRINSIC +#ifdef XERCES_USE_SIMD if(XMLPlatformUtils::fgSSE2ok) { +# if defined(XERCES_HAVE_SSE2_INTRINSIC) __m128i xmm1 = _mm_loadu_si128(reinterpret_cast(fBits)); __m128i xmm2 = _mm_loadu_si128(reinterpret_cast(setToOr.fBits)); __m128i xmm3 = _mm_or_si128(xmm1, xmm2); // OR 4 32-bit words _mm_storeu_si128(reinterpret_cast<__m128i*>(fBits), xmm3); +# else // NEON + int32x4_t v1 = vld1q_s32(reinterpret_cast(fBits)); + int32x4_t v2 = vld1q_s32(reinterpret_cast(setToOr.fBits)); + int32x4_t v3 = vorrq_s32(v1, v2); + vst1q_s32(reinterpret_cast(fBits), v3); +# endif } else #endif @@ -211,15 +230,22 @@ public : { // otherwise, merge them XMLInt32*& mine = fDynamicBuffer->fBitArray[index]; -#ifdef XERCES_HAVE_SSE2_INTRINSIC +#ifdef XERCES_USE_SIMD if(XMLPlatformUtils::fgSSE2ok) { for(XMLSize_t subIndex = 0; subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex+=4) { +# if defined(XERCES_HAVE_SSE2_INTRINSIC) __m128i xmm1 = _mm_load_si128(reinterpret_cast(&other[subIndex])); __m128i xmm2 = _mm_load_si128(reinterpret_cast(&mine[subIndex])); __m128i xmm3 = _mm_or_si128(xmm1, xmm2); // OR 4 32-bit words _mm_store_si128(reinterpret_cast<__m128i*>(&mine[subIndex]), xmm3); +# else // NEON + int32x4_t v1 = vld1q_s32(reinterpret_cast(&other[subIndex])); + int32x4_t v2 = vld1q_s32(reinterpret_cast(&mine[subIndex])); + int32x4_t v3 = vorrq_s32(v1, v2); + vst1q_s32(reinterpret_cast(&mine[subIndex]), v3); +# endif } } else @@ -496,9 +522,23 @@ private : // ----------------------------------------------------------------------- void allocateChunk(const XMLSize_t index) { -#ifdef XERCES_HAVE_SSE2_INTRINSIC +#ifdef XERCES_USE_SIMD if(XMLPlatformUtils::fgSSE2ok) +# if defined(XERCES_HAVE_SSE2_INTRINSIC) + // SSE2 builds: use Intel's matched _mm_malloc / _mm_free pair. fDynamicBuffer->fBitArray[index]=(XMLInt32*)_mm_malloc(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32), 16); +# elif defined(_MSC_VER) && !defined(__MINGW32__) + // NEON on MSVC (Win-ARM/ARM64): use the MSVC CRT aligned allocator. + fDynamicBuffer->fBitArray[index]=(XMLInt32*)_aligned_malloc(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32), 16); +# else + // NEON on POSIX (AArch64 Linux/macOS, MinGW): POSIX aligned allocator. + { + void* p = 0; + if (posix_memalign(&p, 16, CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)) != 0) + p = 0; + fDynamicBuffer->fBitArray[index]=(XMLInt32*)p; + } +# endif else #endif fDynamicBuffer->fBitArray[index]=(XMLInt32*)fDynamicBuffer->fMemoryManager->allocate(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); @@ -506,9 +546,15 @@ private : void deallocateChunk(const XMLSize_t index) { -#ifdef XERCES_HAVE_SSE2_INTRINSIC +#ifdef XERCES_USE_SIMD if(XMLPlatformUtils::fgSSE2ok) +# if defined(XERCES_HAVE_SSE2_INTRINSIC) _mm_free(fDynamicBuffer->fBitArray[index]); +# elif defined(_MSC_VER) && !defined(__MINGW32__) + _aligned_free(fDynamicBuffer->fBitArray[index]); +# else + ::free(fDynamicBuffer->fBitArray[index]); +# endif else #endif fDynamicBuffer->fMemoryManager->deallocate(fDynamicBuffer->fBitArray[index]);