Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions cmake/XercesSSE2.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,52 @@ int main() {
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_SAVE}")
endif()
endif()

# ARM NEON support

option(neon "ARM NEON support" ON)
if(neon)
set(CMAKE_CXX_FLAGS_SAVE_NEON "${CMAKE_CXX_FLAGS}")
if((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND
CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|ARM)($|v)")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
check_cxx_source_compiles("
#include <arm_neon.h>
int main() {
int32x4_t v = vdupq_n_s32(0);
(void)v;
return 0;
}"
CXX_NEEDS_mfpu_neon)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_SAVE_NEON}")
if(CXX_NEEDS_mfpu_neon)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
endif()
endif()

check_cxx_source_compiles("
#include <arm_neon.h>

int main() {
return 0;
}"
XERCES_HAVE_ARM_NEON_H)

check_cxx_source_compiles("
#include <arm_neon.h>

int main() {
alignas(16) int data1[4] = {1,2,3,4};
alignas(16) int data2[4] = {5,6,7,8};
int32x4_t a = vld1q_s32(data1);
int32x4_t b = vld1q_s32(data2);
int32x4_t c = vorrq_s32(a, b);
vst1q_s32(data1, c);
return 0;
}"
XERCES_HAVE_NEON_INTRINSIC)

if(NOT XERCES_HAVE_NEON_INTRINSIC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_SAVE_NEON}")
endif()
endif()
6 changes: 6 additions & 0 deletions config.h.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,12 @@
/* Define to have SSE2 instruction used at runtime */
#cmakedefine XERCES_HAVE_SSE2_INTRINSIC 1

/* Define to 1 if you have arm_neon.h */
#cmakedefine XERCES_HAVE_ARM_NEON_H 1

/* Define to have NEON instruction used at runtime */
#cmakedefine XERCES_HAVE_NEON_INTRINSIC 1

/* Define to 1 if we have sys/types.h */
#cmakedefine XERCES_HAVE_SYS_TYPES_H 1

Expand Down
66 changes: 66 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,13 @@ AC_ARG_ENABLE(sse2,
[have_sse2=${enableval}],
[have_sse2=yes])

# Allow the user to disable ARM NEON support.
#
AC_ARG_ENABLE(neon,
AC_HELP_STRING([--disable-neon],[disable ARM NEON optimizations]),
[have_neon=${enableval}],
[have_neon=yes])

######################################################
# Define some namespace-protected macros for use in the
# publicly visible Xerces_autoconf_config.h file.
Expand Down Expand Up @@ -499,6 +506,65 @@ if test "$have_sse2" = "yes"; then
fi
fi

if test "$have_neon" = "yes"; then
no_neon_CXXFLAGS="$CXXFLAGS"
case $host_cpu in
arm*)
save_CXXFLAGS="$CXXFLAGS"
CXXFLAGS="$CXXFLAGS -mfpu=neon"
AC_MSG_CHECKING([whether we need to add -mfpu=neon])
AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[#include <arm_neon.h>]],
[[int32x4_t v = vdupq_n_s32(0); (void)v;]])],
[mfpu_neon_ok=yes],
[mfpu_neon_ok=no]
)
AC_MSG_RESULT($mfpu_neon_ok)
if test x"$mfpu_neon_ok" = xno; then
CXXFLAGS="$save_CXXFLAGS"
fi
;;
esac

AC_MSG_CHECKING([for arm_neon.h usability])
AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[#include <arm_neon.h>]],
[[]])],
[
AC_MSG_RESULT([yes])
AC_DEFINE_UNQUOTED([XERCES_HAVE_ARM_NEON_H], 1, [Define to 1 if you have arm_neon.h])
],
[
AC_MSG_RESULT([no])
]
)

AC_MSG_CHECKING([whether the compiler has the NEON intrinsic])
AC_COMPILE_IFELSE( [AC_LANG_PROGRAM([[#include <arm_neon.h>]],
[[
int data1[[4]] __attribute__((aligned(16))) = {1,2,3,4};
int data2[[4]] __attribute__((aligned(16))) = {5,6,7,8};
int32x4_t a = vld1q_s32(data1);
int32x4_t b = vld1q_s32(data2);
int32x4_t c = vorrq_s32(a, b);
vst1q_s32(data1, c);
]])],
[
AC_MSG_RESULT([yes])
neon_usable=yes
AC_DEFINE_UNQUOTED([XERCES_HAVE_NEON_INTRINSIC], 1, [Define to have NEON instruction used at runtime])
],
[
AC_MSG_RESULT([no])
neon_usable=no
]
)

# Restore original CXXFLAGS if NEON is not usable.
#
if test "$neon_usable" = "no"; then
CXXFLAGS="$no_neon_CXXFLAGS"
fi
fi

AC_OUTPUT

AC_MSG_NOTICE
Expand Down
4 changes: 2 additions & 2 deletions src/xercesc/util/PlatformUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ void XMLPlatformUtils::Initialize(const char* const locale
endianTest.ch = 1;
fgXMLChBigEndian = (endianTest.ar[sizeof(XMLCh)-1] == 1);

// Determine if we can use SSE2 functions
// Determine if we can use SSE2 / NEON functions
#if defined(XERCES_HAVE_CPUID_INTRINSIC)
int CPUInfo[4]={0};
__cpuid(CPUInfo, 1);
Expand All @@ -237,7 +237,7 @@ void XMLPlatformUtils::Initialize(const char* const locale
fgSSE2ok = false;
else
fgSSE2ok = true;
#elif defined(XERCES_HAVE_SSE2_INTRINSIC)
#elif defined(XERCES_HAVE_SSE2_INTRINSIC) || defined(XERCES_HAVE_NEON_INTRINSIC)
// if we cannot find out at runtime, assume the define has it right
fgSSE2ok = true;
#else
Expand Down
2 changes: 2 additions & 0 deletions src/xercesc/util/Xerces_autoconf_config.hpp.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
#cmakedefine XERCES_HAVE_CPUID_INTRINSIC 1
#cmakedefine XERCES_HAVE_SSE2_INTRINSIC 1
#cmakedefine XERCES_HAVE_GETCPUID 1
#cmakedefine XERCES_HAVE_ARM_NEON_H 1
#cmakedefine XERCES_HAVE_NEON_INTRINSIC 1

#cmakedefine XERCES_DLL_EXPORT 1
#cmakedefine XERCES_STATIC_LIBRARY 1
Expand Down
2 changes: 2 additions & 0 deletions src/xercesc/util/Xerces_autoconf_config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
#undef XERCES_HAVE_CPUID_INTRINSIC
#undef XERCES_HAVE_SSE2_INTRINSIC
#undef XERCES_HAVE_GETCPUID
#undef XERCES_HAVE_ARM_NEON_H
#undef XERCES_HAVE_NEON_INTRINSIC

#undef XERCES_PLATFORM_EXPORT
#undef XERCES_PLATFORM_IMPORT
Expand Down
58 changes: 52 additions & 6 deletions src/xercesc/validators/common/CMStateSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,27 @@
#if XERCES_HAVE_EMMINTRIN_H
# include <emmintrin.h>
#endif
#if XERCES_HAVE_ARM_NEON_H
# include <arm_neon.h>
#endif

#if defined(XERCES_HAVE_SSE2_INTRINSIC) || defined(XERCES_HAVE_NEON_INTRINSIC)
# define XERCES_USE_SIMD 1
# if defined(_MSC_VER) && !defined(__MINGW32__)
# include <malloc.h> // _aligned_malloc
# else
# include <stdlib.h> // posix_memalign
# endif
#endif

namespace XERCES_CPP_NAMESPACE {

class CMStateSetEnumerator;

// This value must be 4 in order to use the SSE2 instruction set
// This value must be 4 in order to use the SIMD (SSE2/NEON) instruction set
#define CMSTATE_CACHED_INT32_SIZE 4

// This value must be a multiple of 128 in order to use the SSE2 instruction set
// This value must be a multiple of 128 in order to use the SIMD (SSE2/NEON) instruction set
#define CMSTATE_BITFIELD_CHUNK 1024
#define CMSTATE_BITFIELD_INT32_SIZE (1024 / 32)

Expand Down Expand Up @@ -171,13 +183,20 @@ public :
{
if(fDynamicBuffer==0)
{
#ifdef XERCES_HAVE_SSE2_INTRINSIC
#ifdef XERCES_USE_SIMD
if(XMLPlatformUtils::fgSSE2ok)
{
# if defined(XERCES_HAVE_SSE2_INTRINSIC)
__m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(fBits));
__m128i xmm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(setToOr.fBits));
__m128i xmm3 = _mm_or_si128(xmm1, xmm2); // OR 4 32-bit words
_mm_storeu_si128(reinterpret_cast<__m128i*>(fBits), xmm3);
# else // NEON
int32x4_t v1 = vld1q_s32(reinterpret_cast<const int32_t*>(fBits));
int32x4_t v2 = vld1q_s32(reinterpret_cast<const int32_t*>(setToOr.fBits));
int32x4_t v3 = vorrq_s32(v1, v2);
vst1q_s32(reinterpret_cast<int32_t*>(fBits), v3);
# endif
}
else
#endif
Expand Down Expand Up @@ -211,15 +230,22 @@ public :
{
// otherwise, merge them
XMLInt32*& mine = fDynamicBuffer->fBitArray[index];
#ifdef XERCES_HAVE_SSE2_INTRINSIC
#ifdef XERCES_USE_SIMD
if(XMLPlatformUtils::fgSSE2ok)
{
for(XMLSize_t subIndex = 0; subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex+=4)
{
# if defined(XERCES_HAVE_SSE2_INTRINSIC)
__m128i xmm1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&other[subIndex]));
__m128i xmm2 = _mm_load_si128(reinterpret_cast<const __m128i*>(&mine[subIndex]));
__m128i xmm3 = _mm_or_si128(xmm1, xmm2); // OR 4 32-bit words
_mm_store_si128(reinterpret_cast<__m128i*>(&mine[subIndex]), xmm3);
# else // NEON
int32x4_t v1 = vld1q_s32(reinterpret_cast<const int32_t*>(&other[subIndex]));
int32x4_t v2 = vld1q_s32(reinterpret_cast<const int32_t*>(&mine[subIndex]));
int32x4_t v3 = vorrq_s32(v1, v2);
vst1q_s32(reinterpret_cast<int32_t*>(&mine[subIndex]), v3);
# endif
}
}
else
Expand Down Expand Up @@ -496,19 +522,39 @@ private :
// -----------------------------------------------------------------------
void allocateChunk(const XMLSize_t index)
{
#ifdef XERCES_HAVE_SSE2_INTRINSIC
#ifdef XERCES_USE_SIMD
if(XMLPlatformUtils::fgSSE2ok)
# if defined(XERCES_HAVE_SSE2_INTRINSIC)
// SSE2 builds: use Intel's matched _mm_malloc / _mm_free pair.
fDynamicBuffer->fBitArray[index]=(XMLInt32*)_mm_malloc(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32), 16);
# elif defined(_MSC_VER) && !defined(__MINGW32__)
// NEON on MSVC (Win-ARM/ARM64): use the MSVC CRT aligned allocator.
fDynamicBuffer->fBitArray[index]=(XMLInt32*)_aligned_malloc(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32), 16);
# else
// NEON on POSIX (AArch64 Linux/macOS, MinGW): POSIX aligned allocator.
{
void* p = 0;
if (posix_memalign(&p, 16, CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)) != 0)
p = 0;
fDynamicBuffer->fBitArray[index]=(XMLInt32*)p;
}
# endif
else
#endif
fDynamicBuffer->fBitArray[index]=(XMLInt32*)fDynamicBuffer->fMemoryManager->allocate(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32));
}

void deallocateChunk(const XMLSize_t index)
{
#ifdef XERCES_HAVE_SSE2_INTRINSIC
#ifdef XERCES_USE_SIMD
if(XMLPlatformUtils::fgSSE2ok)
# if defined(XERCES_HAVE_SSE2_INTRINSIC)
_mm_free(fDynamicBuffer->fBitArray[index]);
# elif defined(_MSC_VER) && !defined(__MINGW32__)
_aligned_free(fDynamicBuffer->fBitArray[index]);
# else
::free(fDynamicBuffer->fBitArray[index]);
# endif
else
#endif
fDynamicBuffer->fMemoryManager->deallocate(fDynamicBuffer->fBitArray[index]);
Expand Down