Skip to content

Commit a0b659e

Browse files
committed
fix: revert old pattern scan code
1 parent 104f773 commit a0b659e

3 files changed

Lines changed: 163 additions & 39 deletions

File tree

CMakeLists.txt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,15 @@ set(SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src")
3434

3535
include(cmake/platform/shared.cmake)
3636

37+
# Classify the target platform and toolchain. Both helpers come from
# cmake/platform/shared.cmake and publish their results as variables
# in this scope.
detect_system()
detect_compiler()

# Collapse the two ARM flavours into a single boolean. Default to FALSE and
# flip it only when detect_system() reported an ARM64 or ARM32 target.
set(IS_ARM FALSE)
if(DYNLIBUTILS_CPU_ARCH_ARM64 OR DYNLIBUTILS_CPU_ARCH_ARM32)
    set(IS_ARM TRUE)
endif()
45+
3746
if(WINDOWS)
3847
include(cmake/platform/windows.cmake)
3948
elseif(LINUX)
@@ -47,6 +56,8 @@ set(COMPILE_DEFINITIONS
4756
DYNLIBUTILS_PLATFORM_LINUX=$<BOOL:${LINUX}>
4857
DYNLIBUTILS_PLATFORM_WINDOWS=$<BOOL:${WIN32}>
4958
DYNLIBUTILS_PLATFORM_APPLE=$<BOOL:${APPLE}>
59+
DYNLIBUTILS_ARCH_ARM=$<BOOL:${IS_ARM}>
60+
DYNLIBUTILS_ARCH_BITS=$<IF:$<EQUAL:${CMAKE_SIZEOF_VOID_P},8>,64,32>
5061
)
5162

5263
set(INCLUDE_DIRS

cmake/platform/shared.cmake

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,104 @@ set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING
2222
"Only do Release and Debug"
2323
FORCE
2424
)
25+
26+
# detect_system()
#
# Classifies the target operating system and CPU architecture and publishes
# the result to the caller's scope.
#
# Reads:  CMAKE_SYSTEM_NAME, CMAKE_SYSTEM_PROCESSOR, CMAKE_GENERATOR_PLATFORM,
#         CMAKE_OSX_ARCHITECTURES, CMAKE_SIZEOF_VOID_P, UNIX, CYGWIN, APPLE,
#         and /etc/os-release when the system name matches "Linux".
#
# Sets in the caller's scope (PARENT_SCOPE):
#   LINUX                      - TRUE when CMAKE_SYSTEM_NAME matches "Linux"
#   POSIX                      - TRUE when UNIX or CYGWIN is set
#   DYNLIBUTILS_CPU_ARCH_<A>   - TRUE for the detected architecture, where <A>
#                                is X86, X64, ARM64, ARM32, RISCV32 or RISCV64
#   DYNLIBUTILS_PLATFORM       - "<system>_<arch>", e.g. "ubuntu_x64"
#
# Stops the configure step with FATAL_ERROR when the architecture cannot be
# classified or more than one macOS architecture is requested.
function(detect_system)
    message(STATUS "CMake Version: ${CMAKE_VERSION}")
    message(STATUS "CMake System Name: ${CMAKE_SYSTEM_NAME}")
    message(STATUS "CMake System Processor: ${CMAKE_SYSTEM_PROCESSOR}")

    # Defaults; the system name may be refined to the distribution ID below
    # and the architecture is always overwritten by the classification chain.
    string(TOLOWER "${CMAKE_SYSTEM_NAME}" DYNLIBUTILS_SYSTEM)
    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" DYNLIBUTILS_ARCH)

    if(CMAKE_SYSTEM_NAME MATCHES "Linux")
        set(LINUX TRUE PARENT_SCOPE)

        # Check if /etc/os-release exists
        if(EXISTS "/etc/os-release")
            file(STRINGS "/etc/os-release" OS_RELEASE_ID_LINE REGEX "^ID=")
            string(REGEX REPLACE "^ID=" "" OS_ID "${OS_RELEASE_ID_LINE}")
            # The value may be wrapped in double or single quotes; strip either.
            string(REGEX REPLACE "^[\"'](.*)[\"']$" "\\1" OS_ID "${OS_ID}")
            string(LENGTH "${OS_ID}" OS_ID_LENGTH)
            if(OS_ID_LENGTH GREATER 0)
                string(TOLOWER "${OS_ID}" DYNLIBUTILS_SYSTEM)
                message(STATUS "Detected Linux distribution: ${OS_ID}")
            else()
                # Keep the lowercase system name instead of producing an
                # empty system component in DYNLIBUTILS_PLATFORM.
                message(WARNING "Cannot detect Linux distribution: no ID entry in /etc/os-release")
            endif()
        else()
            message(WARNING "Cannot detect Linux distribution: /etc/os-release not found")
        endif()
    endif()

    if(UNIX OR CYGWIN)
        set(POSIX TRUE PARENT_SCOPE)
    endif()

    if(APPLE AND NOT "${CMAKE_OSX_ARCHITECTURES}" STREQUAL "")
        # An explicit macOS architecture request takes precedence over the
        # processor name; universal (multi-arch) builds are rejected.
        list(LENGTH CMAKE_OSX_ARCHITECTURES ARCH_COUNT)
        if(ARCH_COUNT GREATER 1)
            message(FATAL_ERROR "More than one architecture is not supported!")
        endif()
        list(GET CMAKE_OSX_ARCHITECTURES 0 ARCH_NAME)
        if(ARCH_NAME STREQUAL "x86_64")
            message(STATUS "Building x86_64 MacOS binaries")
            set(DYNLIBUTILS_ARCH "x64")
            set(DYNLIBUTILS_CPU_ARCH_X64 TRUE PARENT_SCOPE)
        elseif(ARCH_NAME STREQUAL "arm64")
            message(STATUS "Building ARM64 MacOS binaries")
            set(DYNLIBUTILS_ARCH "arm64")
            set(DYNLIBUTILS_CPU_ARCH_ARM64 TRUE PARENT_SCOPE)
        else()
            message(FATAL_ERROR "Unknown architecture: ${CMAKE_OSX_ARCHITECTURES}")
        endif()
    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$")
        message(STATUS "Building x86 binaries")
        set(DYNLIBUTILS_ARCH "x86")
        set(DYNLIBUTILS_CPU_ARCH_X86 TRUE PARENT_SCOPE)
    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64") # must be before arm64
        message(STATUS "Building x86_64 binaries")
        set(DYNLIBUTILS_ARCH "x64")
        set(DYNLIBUTILS_CPU_ARCH_X64 TRUE PARENT_SCOPE)
    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv8.?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64")
        message(STATUS "Building ARM64 binaries")
        set(DYNLIBUTILS_ARCH "arm64")
        set(DYNLIBUTILS_CPU_ARCH_ARM64 TRUE PARENT_SCOPE)
    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567][a-z-]*|ARM)$")
        # The optional suffix accepts processor names such as "armv7l",
        # "armv6l" and "armv7-a", not only the bare "armv7" form.
        message(STATUS "Building ARM32 binaries")
        set(DYNLIBUTILS_ARCH "arm32")
        set(DYNLIBUTILS_CPU_ARCH_ARM32 TRUE PARENT_SCOPE)
    elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
        # The processor name alone does not fix the width; use pointer size.
        if(CMAKE_SIZEOF_VOID_P EQUAL 4)
            message(STATUS "Building RISC-V 32 binaries")
            set(DYNLIBUTILS_ARCH "riscv32")
            set(DYNLIBUTILS_CPU_ARCH_RISCV32 TRUE PARENT_SCOPE)
        elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
            message(STATUS "Building RISC-V 64 binaries")
            set(DYNLIBUTILS_ARCH "riscv64")
            set(DYNLIBUTILS_CPU_ARCH_RISCV64 TRUE PARENT_SCOPE)
        else()
            message(FATAL_ERROR "Unknown sizeof void: ${CMAKE_SIZEOF_VOID_P}")
        endif()
    else()
        message(FATAL_ERROR "Unknown architecture: ${CMAKE_SYSTEM_PROCESSOR}")
    endif()

    set(DYNLIBUTILS_PLATFORM "${DYNLIBUTILS_SYSTEM}_${DYNLIBUTILS_ARCH}" PARENT_SCOPE)
    message(STATUS "Building for ${DYNLIBUTILS_SYSTEM}_${DYNLIBUTILS_ARCH}")
endfunction()
105+
106+
# detect_compiler()
#
# Classifies the active C++ compiler from CMAKE_CXX_COMPILER_ID (plus MSVC,
# which is how Clang-CL is told apart from plain Clang), logs the result and
# sets exactly one DYNLIBUTILS_COMPILER_<KIND> variable to TRUE in the
# caller's scope. <KIND> is CLANG_CL, CLANG, GCC, INTEL or MSVC.
#
# Stops the configure step with FATAL_ERROR for any other compiler.
function(detect_compiler)
    # Step 1: pick the display label and the flag suffix. Order matters:
    # the Clang-CL test must precede both the Clang test and the MSVC test.
    if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND MSVC)
        set(toolchain_label "Clang-CL")
        set(toolchain_kind "CLANG_CL")
    elseif(CMAKE_CXX_COMPILER_ID MATCHES "^(Clang|AppleClang)$")
        set(toolchain_label "Clang/LLVM")
        set(toolchain_kind "CLANG")
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
        set(toolchain_label "GNU GCC")
        set(toolchain_kind "GCC")
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
        set(toolchain_label "Intel")
        set(toolchain_kind "INTEL")
    elseif(MSVC)
        set(toolchain_label "MSVC")
        set(toolchain_kind "MSVC")
    else()
        message(FATAL_ERROR "Unknown compiler: ${CMAKE_CXX_COMPILER_ID}")
    endif()

    # Step 2: report and export the single matching flag.
    message(STATUS "Building with ${toolchain_label}")
    set(DYNLIBUTILS_COMPILER_${toolchain_kind} TRUE PARENT_SCOPE)
endfunction()

include/dynlibutils/module.hpp

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <array>
1717
#include <cassert>
18+
#include <cmath>
1819
#include <string>
1920
#include <string_view>
2021
#include <type_traits>
@@ -439,7 +440,7 @@ class CAssemblyModule : public CMemory
439440
template<std::size_t SIZE = (s_nDefaultPatternSize - 1) / 2>
440441
inline CMemory FindPattern(const CMemoryView<std::uint8_t> pPatternMem, const std::string_view svMask, const CMemory pStartAddress, const Section_t* pModuleSection) const
441442
{
442-
volatile const auto* pPattern = pPatternMem.RCastView();
443+
const auto* pPattern = pPatternMem.RCastView();
443444

444445
CCache sKey(pPattern, svMask.size(), pStartAddress, pModuleSection);
445446
if (auto pAddr = GetAddress(sKey))
@@ -468,65 +469,76 @@ class CAssemblyModule : public CMemory
468469
pData = start;
469470
}
470471

471-
constexpr auto kSimdBytes = sizeof(__m128i); // 128 bits = 16 bytes.
472-
constexpr auto kMaxSimdBlocks = std::max<std::size_t>(1u, std::min<std::size_t>(SIZE, s_nMaxSimdBlocks));
472+
#if !DYNLIBUTILS_ARCH_ARM
473+
std::array<int, 64> masks = {};// 64*16 = enough masks for 1024 bytes.
474+
auto numMasks = static_cast<std::uint8_t>(std::ceil(static_cast<float>(patternSize) / 16.f));
473475

474-
const std::size_t numBlocks = (patternSize + (kSimdBytes - 1)) / kSimdBytes;
475-
476-
std::uint16_t bitMasks[kMaxSimdBlocks] = {};
477-
__m128i patternChunks[kMaxSimdBlocks];
478-
479-
for (std::size_t n = 0; n < numBlocks; ++n)
476+
for (std::uint8_t i = 0; i < numMasks; ++i)
480477
{
481-
const std::size_t offset = n * kSimdBytes;
482-
patternChunks[n] = _mm_loadu_si128(reinterpret_cast<const __m128i*>(const_cast<std::uint8_t*>(pPattern) + offset));
483-
484-
for (std::size_t j = 0; j < kSimdBytes; ++j)
478+
for (std::int8_t j = static_cast<std::int8_t>(std::min<std::size_t>(patternSize - i * 16, 16)) - 1; j >= 0; --j)
485479
{
486-
const std::size_t idx = offset + j;
487-
if (idx >= patternSize)
488-
break;
489-
490-
if (svMask[idx] == 'x')
491-
bitMasks[n] |= (1u << j);
480+
if (svMask[static_cast<std::size_t>(i * 16 + j)] == 'x')
481+
{
482+
masks[i] |= 1 << j;
483+
}
492484
}
493485
}
494486

495-
// How far ahead (in bytes) to prefetch during scanning.
496-
// This is calculated based on how many SIMD blocks (16 bytes each) will be read
497-
// in the current pattern match attempt.
498-
//
499-
// Helps reduce cache misses during large linear memory scans by hinting the CPU
500-
// to load the next block of memory before it is needed.
501-
const std::size_t lookAhead = numBlocks * kSimdBytes;
502-
503-
for (; pData <= pEnd; ++pData)
487+
const __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pPattern));
488+
__m128i xmm2, xmm3, msks;
489+
for (; pData != pEnd; _mm_prefetch(reinterpret_cast<const char*>(++pData + 64), _MM_HINT_NTA))
504490
{
505-
if (static_cast<std::size_t>(pEnd - pData) > lookAhead)
506-
_mm_prefetch(reinterpret_cast<const char*>(pData + lookAhead), _MM_HINT_NTA);
491+
xmm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pData));
492+
msks = _mm_cmpeq_epi8(xmm1, xmm2);
493+
if ((_mm_movemask_epi8(msks) & masks[0]) == masks[0])
494+
{
495+
bool found = true;
496+
for (uint8_t i = 1; i < numMasks; ++i)
497+
{
498+
xmm2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((pData + i * 16)));
499+
xmm3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((pPattern + i * 16)));
500+
msks = _mm_cmpeq_epi8(xmm2, xmm3);
501+
if ((_mm_movemask_epi8(msks) & masks[i]) != masks[i])
502+
{
503+
found = false;
504+
break;
505+
}
506+
}
507507

508-
bool bFound = true;
508+
if (found)
509+
{
510+
UniqueLock_t lock(m_mutex);
511+
m_mapCached[std::move(sKey)] = pData;
512+
return pData;
513+
}
514+
}
515+
}
516+
#else
517+
for (; pData != pEnd; ++pData)
518+
{
519+
bool found = false;
509520

510-
for (std::size_t n = 0; n < numBlocks; ++n)
521+
for (size_t i = 0; i < maskLen; ++i)
511522
{
512-
const __m128i dataChunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pData + n * kSimdBytes));
513-
const __m128i cmp = _mm_cmpeq_epi8(dataChunk, patternChunks[n]);
514-
const int mask = _mm_movemask_epi8(cmp);
515-
516-
if ((mask & bitMasks[n]) != bitMasks[n])
523+
if (mask[i] == 'x' || pPattern[i] == *(pData + i))
524+
{
525+
found = true;
526+
}
527+
else
517528
{
518-
bFound = false;
529+
found = false;
519530
break;
520531
}
521532
}
522533

523-
if (bFound)
534+
if (found)
524535
{
525536
UniqueLock_t lock(m_mutex);
526537
m_mapCached[std::move(sKey)] = pData;
527538
return pData;
528539
}
529540
}
541+
#endif // !DYNLIBUTILS_ARCH_ARM
530542

531543
return DYNLIB_INVALID_MEMORY;
532544
}

0 commit comments

Comments
 (0)