From ae6c3ed2d757abc5b2b398ab3bcb00ea3f16e735 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 5 Feb 2026 22:11:25 +0000 Subject: [PATCH 1/9] Add replace benchmarks for all types --- benchmarks/src/replace.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/src/replace.cpp b/benchmarks/src/replace.cpp index 28ab1c12ddb..172525bf763 100644 --- a/benchmarks/src/replace.cpp +++ b/benchmarks/src/replace.cpp @@ -34,7 +34,8 @@ void rc(benchmark::State& state) { } } -// replace() is vectorized for 4 and 8 bytes only. +BENCHMARK(r); +BENCHMARK(r); BENCHMARK(r); BENCHMARK(r); From f3b8b1c26e6f423c1ca425142890e499a1305018 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Tue, 17 Feb 2026 14:52:17 +0000 Subject: [PATCH 2/9] Add SVE implementation of `replace` --- stl/inc/algorithm | 24 +++++- stl/inc/xutility | 2 +- stl/src/vector_algorithms.cpp | 145 +++++++++++++++++++++++++++++++--- 3 files changed, 159 insertions(+), 12 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 8d5925159b8..9ed4eb50a15 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -117,6 +117,13 @@ __declspec(noalias) bool __stdcall __std_includes_less_8u( #endif // ^^^ _VECTORIZED_INCLUDES ^^^ #if _VECTORIZED_REPLACE +#if defined(_M_ARM64) || defined(_M_ARM64EC) +__declspec(noalias) void __stdcall __std_replace_1( + void* _First, void* _Last, uint8_t _Old_val, uint8_t _New_val) noexcept; +__declspec(noalias) void __stdcall __std_replace_2( + void* _First, void* _Last, uint16_t _Old_val, uint16_t _New_val) noexcept; +#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ + // TRANSITION, DevCom-10610477 __declspec(noalias) void __stdcall __std_replace_4( void* _First, void* _Last, uint32_t _Old_val, uint32_t _New_val) noexcept; @@ -363,6 +370,13 @@ bool _Includes_vectorized( template __declspec(noalias) void _Replace_vectorized( _Ty* const _First, _Ty* const _Last, const _TVal1 _Old_val, const _TVal2 _New_val) noexcept { +#if defined(_M_ARM64) || defined(_M_ARM64EC) + if constexpr (sizeof(_Ty) == 1) { + ::__std_replace_1(_First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); + } else if constexpr (sizeof(_Ty) == 2) { + ::__std_replace_2(_First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); + } else +#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ if constexpr (sizeof(_Ty) == 4) { ::__std_replace_4( _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); @@ -471,10 +485,18 @@ _Ty* _Unique_copy_vectorized(const _Ty* const _First, const _Ty* const _Last, _T #endif // ^^^ _VECTORIZED_UNIQUE_COPY ^^^ #if _VECTORIZED_REPLACE +template +constexpr bool _Have_masked_op_for_iter = +#if defined(_M_ARM64) || defined(_M_ARM64EC) + true; +#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv + sizeof(_Iter_value_t<_Iter>) >= 4; // avx masked op compatible size +#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ + // Can we activate the vector algorithms for replace? template constexpr bool _Vector_alg_in_replace_is_safe = _Vector_alg_in_find_is_safe<_Iter, _Ty1> // can search for the value - && sizeof(_Iter_value_t<_Iter>) >= 4; // avx masked op compatible size + && _Have_masked_op_for_iter<_Iter>; // Can we activate the vector algorithms for ranges::replace? template diff --git a/stl/inc/xutility b/stl/inc/xutility index d023baae805..51411bf9b10 100644 --- a/stl/inc/xutility +++ b/stl/inc/xutility @@ -89,7 +89,7 @@ _STL_DISABLE_CLANG_WARNINGS #define _VECTORIZED_MISMATCH _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC #define _VECTORIZED_REMOVE _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC #define _VECTORIZED_REMOVE_COPY _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC -#define _VECTORIZED_REPLACE _VECTORIZED_FOR_X64_X86 +#define _VECTORIZED_REPLACE _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC #define _VECTORIZED_REPLACE_COPY _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC #define _VECTORIZED_REVERSE _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC #define _VECTORIZED_REVERSE_COPY _VECTORIZED_FOR_X64_X86_ARM64_ARM64EC diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index a3d5dc8afba..f2bf52ce494 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -13,6 +13,7 @@ #if defined(_M_ARM64) || defined(_M_ARM64EC) #include +#include #include #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv @@ -9430,6 +9431,116 @@ __declspec(noalias) size_t __stdcall __std_mismatch_8( namespace { namespace _Replacing { #if defined(_M_ARM64) || defined(_M_ARM64EC) + struct _Traits_1_sve { + static svuint8_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + return svld1(_Pred, static_cast(_Ptr)); + } + + static svuint8_t _Set(const uint8_t _Val) noexcept { + return svdup_n_u8(_Val); + } + + static svbool_t _Cmp(const svbool_t _Pred, const svuint8_t _Lhs, const svuint8_t _Rhs) noexcept { + return svcmpeq(_Pred, _Lhs, _Rhs); + } + + static void _Store(const svbool_t _Pred, void* const _Ptr, const svuint8_t _Val) noexcept { + svst1(_Pred, static_cast(_Ptr), _Val); + } + }; + + struct _Traits_2_sve { + static svuint16_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + return svld1(_Pred, static_cast(_Ptr)); + } + + static svuint16_t _Set(const uint16_t _Val) noexcept { + return svdup_n_u16(_Val); + } + + static svbool_t _Cmp(const svbool_t _Pred, const svuint16_t _Lhs, const svuint16_t _Rhs) noexcept { + return svcmpeq(_Pred, _Lhs, _Rhs); + } + + static void _Store(const svbool_t _Pred, void* const _Ptr, const svuint16_t _Val) noexcept { + svst1(_Pred, static_cast(_Ptr), _Val); + } + }; + + struct _Traits_4_sve { + static svuint32_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + return svld1(_Pred, static_cast(_Ptr)); + } + + static svuint32_t _Set(const uint32_t _Val) noexcept { + return svdup_n_u32(_Val); + } + + static svbool_t _Cmp(const svbool_t _Pred, const svuint32_t _Lhs, const svuint32_t _Rhs) noexcept { + return svcmpeq(_Pred, _Lhs, _Rhs); + } + + static void _Store(const svbool_t _Pred, void* const _Ptr, const svuint32_t _Val) noexcept { + svst1(_Pred, static_cast(_Ptr), _Val); + } + }; + + struct _Traits_8_sve { + static svuint64_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + return svld1(_Pred, static_cast(_Ptr)); + } + + static svuint64_t _Set(const uint64_t _Val) noexcept { + return svdup_n_u64(_Val); + } + + static svbool_t _Cmp(const svbool_t _Pred, const svuint64_t _Lhs, const svuint64_t _Rhs) noexcept { + return svcmpeq(_Pred, _Lhs, _Rhs); + } + + static void _Store(const svbool_t _Pred, void* const _Ptr, const svuint64_t _Val) noexcept { + svst1(_Pred, static_cast(_Ptr), _Val); + } + }; + + template + __declspec(noalias) void __stdcall _Replace_impl( + void* _First, const void* const _Last, const _Ty _Old_val, const _Ty _New_val) noexcept { + + if (_Use_FEAT_SVE()) { + const size_t _Sve_vl = svcntb(); + const size_t _Size_bytes = _Byte_length(_First, _Last); + const size_t _Full_vl_bytes = _Size_bytes & ~size_t{_Sve_vl - 1}; + + const void* _Stop_at = _First; + _Advance_bytes(_Stop_at, _Full_vl_bytes); + + const auto _Comparand = _Traits::_Set(_Old_val); + const auto _Replacement = _Traits::_Set(_New_val); + + const auto _True = svptrue_b8(); + while (_First != _Stop_at) { + const auto _Data = _Traits::_Load(_True, _First); + const auto _Mask = _Traits::_Cmp(_True, _Data, _Comparand); + _Traits::_Store(_Mask, _First, _Replacement); + _Advance_bytes(_First, _Sve_vl); + } + + if (const size_t _Tail_length = _Size_bytes & size_t{_Sve_vl - 1}; _Tail_length != 0) { + auto _Tail_mask = svwhilelt_b8(size_t{0}, _Tail_length); + const auto _Data = _Traits::_Load(_Tail_mask, _First); + const auto _Mask = _Traits::_Cmp(_Tail_mask, _Data, _Comparand); + _Traits::_Store(_Mask, _First, _Replacement); + } + } else { + for (auto _Cur = static_cast<_Ty*>(_First); _Cur != _Last; ++_Cur) { + if (*_Cur == _Old_val) { + *_Cur = _New_val; + } + } + } + } + template __declspec(noalias) void __stdcall _Replace_copy_impl( const void* _First, const void* const _Last, void* _Dest, const _Ty _Old_val, const _Ty _New_val) noexcept { @@ -9568,10 +9679,29 @@ namespace { extern "C" { -#ifndef _M_ARM64 +#if defined(_M_ARM64) || defined(_M_ARM64EC) +__declspec(noalias) void __stdcall __std_replace_1( + void* const _First, const void* const _Last, const uint8_t _Old_val, const uint8_t _New_val) noexcept { + _Replacing::_Replace_impl<_Replacing::_Traits_1_sve>(_First, _Last, _Old_val, _New_val); +} + +__declspec(noalias) void __stdcall __std_replace_2( + void* const _First, const void* const _Last, const uint16_t _Old_val, const uint16_t _New_val) noexcept { + _Replacing::_Replace_impl<_Replacing::_Traits_2_sve>(_First, _Last, _Old_val, _New_val); +} + +__declspec(noalias) void __stdcall __std_replace_4( + void* const _First, const void* const _Last, const uint32_t _Old_val, const uint32_t _New_val) noexcept { + _Replacing::_Replace_impl<_Replacing::_Traits_4_sve>(_First, _Last, _Old_val, _New_val); +} + +__declspec(noalias) void __stdcall __std_replace_8( + void* const _First, const void* const _Last, const uint64_t _Old_val, const uint64_t _New_val) noexcept { + _Replacing::_Replace_impl<_Replacing::_Traits_8_sve>(_First, _Last, _Old_val, _New_val); +} +#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv __declspec(noalias) void __stdcall __std_replace_4( void* _First, void* const _Last, const uint32_t _Old_val, const uint32_t _New_val) noexcept { -#ifndef _M_ARM64EC if (_Use_avx2()) { const __m256i _Comparand = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(_Old_val)); const __m256i _Replacement = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(_New_val)); @@ -9596,9 +9726,7 @@ __declspec(noalias) void __stdcall __std_replace_4( } _mm256_zeroupper(); // TRANSITION, DevCom-10331414 - } else -#endif // ^^^ !defined(_M_ARM64EC) ^^^ - { + } else { for (auto _Cur = reinterpret_cast(_First); _Cur != _Last; ++_Cur) { if (*_Cur == _Old_val) { *_Cur = _New_val; @@ -9609,7 +9737,6 @@ __declspec(noalias) void __stdcall __std_replace_4( __declspec(noalias) void __stdcall __std_replace_8( void* _First, void* const _Last, const uint64_t _Old_val, const uint64_t _New_val) noexcept { -#ifndef _M_ARM64EC if (_Use_avx2()) { #ifdef _WIN64 const __m256i _Comparand = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(_Old_val)); @@ -9639,9 +9766,7 @@ __declspec(noalias) void __stdcall __std_replace_8( } _mm256_zeroupper(); // TRANSITION, DevCom-10331414 - } else -#endif // ^^^ !defined(_M_ARM64EC) ^^^ - { + } else { for (auto _Cur = reinterpret_cast(_First); _Cur != _Last; ++_Cur) { if (*_Cur == _Old_val) { *_Cur = _New_val; @@ -9649,7 +9774,7 @@ __declspec(noalias) void __stdcall __std_replace_8( } } } -#endif // ^^^ !defined(_M_ARM64) ^^^ +#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ __declspec(noalias) void __stdcall __std_replace_copy_1(const void* const _First, const void* const _Last, void* const _Dest, const uint8_t _Old_val, const uint8_t _New_val) noexcept { From 7437ad01350c2961857de781f11e6bd25bf03106 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 2 Apr 2026 09:29:27 -0700 Subject: [PATCH 3/9] Add braces, fix endif comments, clang-format. --- stl/inc/algorithm | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 613ff9773b4..2e7e724c724 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -126,7 +126,7 @@ __declspec(noalias) void __stdcall __std_replace_1( void* _First, void* _Last, uint8_t _Old_val, uint8_t _New_val) noexcept; __declspec(noalias) void __stdcall __std_replace_2( void* _First, void* _Last, uint16_t _Old_val, uint16_t _New_val) noexcept; -#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ +#endif // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) ^^^ // TRANSITION, DevCom-10610477 __declspec(noalias) void __stdcall __std_replace_4( @@ -392,19 +392,23 @@ __declspec(noalias) void _Replace_vectorized( _Ty* const _First, _Ty* const _Last, const _TVal1 _Old_val, const _TVal2 _New_val) noexcept { #if defined(_M_ARM64) || defined(_M_ARM64EC) if constexpr (sizeof(_Ty) == 1) { - ::__std_replace_1(_First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); + ::__std_replace_1( + _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); } else if constexpr (sizeof(_Ty) == 2) { - ::__std_replace_2(_First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); + ::__std_replace_2( + _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); } else -#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ - if constexpr (sizeof(_Ty) == 4) { - ::__std_replace_4( - _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); - } else if constexpr (sizeof(_Ty) == 8) { - ::__std_replace_8( - _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); - } else { - static_assert(false, "unexpected size"); +#endif // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) ^^^ + { + if constexpr (sizeof(_Ty) == 4) { + ::__std_replace_4( + _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); + } else if constexpr (sizeof(_Ty) == 8) { + ::__std_replace_8( + _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); + } else { + static_assert(false, "unexpected size"); + } } } #endif // ^^^ _VECTORIZED_REPLACE ^^^ From 2adfc55982bd754d5e6b9f2f73493a7f11b8e445 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Thu, 2 Apr 2026 09:47:03 -0700 Subject: [PATCH 4/9] Further reduce test coverage for ARM64EC fallbacks. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 264b0c1926c..68467094e5b 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -1961,11 +1961,6 @@ int main() { test_min_max_element(gen); test_min_max_element_pointers(gen); - - test_replace(gen); - test_replace(gen); - test_replace(gen); - test_replace(gen); #else // ^^^ defined(_CALL_ALL_X64_VECTOR_ALGORITHMS_ON_ARM64EC) / normal test coverage vvv test_vector_algorithms(gen); test_various_containers(); From 9c451104312389163f9e96e5f8c3b8511fc8cfe6 Mon Sep 17 00:00:00 2001 From: Hari Limaye Date: Thu, 2 Apr 2026 17:19:00 +0000 Subject: [PATCH 5/9] Add _VECTORIZED_REPLACE_1_2 macro --- stl/inc/algorithm | 20 ++++++++++---------- stl/inc/xutility | 6 ++++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/stl/inc/algorithm b/stl/inc/algorithm index 2e7e724c724..628d6d1db6e 100644 --- a/stl/inc/algorithm +++ b/stl/inc/algorithm @@ -121,12 +121,12 @@ __declspec(noalias) bool __stdcall __std_includes_less_8u( #endif // ^^^ _VECTORIZED_INCLUDES ^^^ #if _VECTORIZED_REPLACE -#if defined(_M_ARM64) || defined(_M_ARM64EC) +#if _VECTORIZED_REPLACE_1_2 __declspec(noalias) void __stdcall __std_replace_1( void* _First, void* _Last, uint8_t _Old_val, uint8_t _New_val) noexcept; __declspec(noalias) void __stdcall __std_replace_2( void* _First, void* _Last, uint16_t _Old_val, uint16_t _New_val) noexcept; -#endif // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) ^^^ +#endif // ^^^ _VECTORIZED_REPLACE_1_2 ^^^ // TRANSITION, DevCom-10610477 __declspec(noalias) void __stdcall __std_replace_4( @@ -390,7 +390,7 @@ bool _Includes_vectorized( template __declspec(noalias) void _Replace_vectorized( _Ty* const _First, _Ty* const _Last, const _TVal1 _Old_val, const _TVal2 _New_val) noexcept { -#if defined(_M_ARM64) || defined(_M_ARM64EC) +#if _VECTORIZED_REPLACE_1_2 if constexpr (sizeof(_Ty) == 1) { ::__std_replace_1( _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); @@ -398,7 +398,7 @@ __declspec(noalias) void _Replace_vectorized( ::__std_replace_2( _First, _Last, _STD _Find_arg_cast(_Old_val), _STD _Find_arg_cast(_New_val)); } else -#endif // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) ^^^ +#endif // ^^^ _VECTORIZED_REPLACE_1_2 ^^^ { if constexpr (sizeof(_Ty) == 4) { ::__std_replace_4( @@ -509,13 +509,13 @@ _Ty* _Unique_copy_vectorized(const _Ty* const _First, const _Ty* const _Last, _T #endif // ^^^ _VECTORIZED_UNIQUE_COPY ^^^ #if _VECTORIZED_REPLACE +#if _VECTORIZED_REPLACE_1_2 template -constexpr bool _Have_masked_op_for_iter = -#if defined(_M_ARM64) || defined(_M_ARM64EC) - true; -#else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv - sizeof(_Iter_value_t<_Iter>) >= 4; // avx masked op compatible size -#endif // ^^^ !defined(_M_ARM64) && !defined(_M_ARM64EC) ^^^ +constexpr bool _Have_masked_op_for_iter = true; +#else // ^^^ _VECTORIZED_REPLACE_1_2 / !_VECTORIZED_REPLACE_1_2 vvv +template +constexpr bool _Have_masked_op_for_iter = sizeof(_Iter_value_t<_Iter>) >= 4; // avx masked op compatible size +#endif // ^^^ !_VECTORIZED_REPLACE_1_2 ^^^ // Can we activate the vector algorithms for replace? template diff --git a/stl/inc/xutility b/stl/inc/xutility index 276710cd0f2..647c14ce8d9 100644 --- a/stl/inc/xutility +++ b/stl/inc/xutility @@ -104,6 +104,12 @@ _STL_DISABLE_CLANG_WARNINGS // as this does not improve performance over the scalar code. #define _VECTORIZED_MINMAX_ELEMENT_64BIT_INT _VECTORIZED_FOR_X64_X86 +#if defined(_M_ARM64) || defined(_M_ARM64EC) +#define _VECTORIZED_REPLACE_1_2 1 +#else +#define _VECTORIZED_REPLACE_1_2 0 +#endif + #ifndef _USE_STD_VECTOR_FLOATING_ALGORITHMS #if _USE_STD_VECTOR_ALGORITHMS && !defined(_M_FP_EXCEPT) #define _USE_STD_VECTOR_FLOATING_ALGORITHMS 1 From 9b3bdd5392cfcf1685f08c0f8f394056e7289eb2 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 3 Apr 2026 07:45:59 -0700 Subject: [PATCH 6/9] Remove const to match declarations. --- stl/src/vector_algorithms.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 9ff831e2389..14c9e752401 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -9682,7 +9682,7 @@ namespace { template __declspec(noalias) void __stdcall _Replace_impl( - void* _First, const void* const _Last, const _Ty _Old_val, const _Ty _New_val) noexcept { + void* _First, void* const _Last, const _Ty _Old_val, const _Ty _New_val) noexcept { if (_Use_FEAT_SVE()) { const size_t _Sve_vl = svcntb(); @@ -9858,22 +9858,22 @@ extern "C" { #if defined(_M_ARM64) || defined(_M_ARM64EC) __declspec(noalias) void __stdcall __std_replace_1( - void* const _First, const void* const _Last, const uint8_t _Old_val, const uint8_t _New_val) noexcept { + void* const _First, void* const _Last, const uint8_t _Old_val, const uint8_t _New_val) noexcept { _Replacing::_Replace_impl<_Replacing::_Traits_1_sve>(_First, _Last, _Old_val, _New_val); } __declspec(noalias) void __stdcall __std_replace_2( - void* const _First, const void* const _Last, const uint16_t _Old_val, const uint16_t _New_val) noexcept { + void* const _First, void* const _Last, const uint16_t _Old_val, const uint16_t _New_val) noexcept { _Replacing::_Replace_impl<_Replacing::_Traits_2_sve>(_First, _Last, _Old_val, _New_val); } __declspec(noalias) void __stdcall __std_replace_4( - void* const _First, const void* const _Last, const uint32_t _Old_val, const uint32_t _New_val) noexcept { + void* const _First, void* const _Last, const uint32_t _Old_val, const uint32_t _New_val) noexcept { _Replacing::_Replace_impl<_Replacing::_Traits_4_sve>(_First, _Last, _Old_val, _New_val); } __declspec(noalias) void __stdcall __std_replace_8( - void* const _First, const void* const _Last, const uint64_t _Old_val, const uint64_t _New_val) noexcept { + void* const _First, void* const _Last, const uint64_t _Old_val, const uint64_t _New_val) noexcept { _Replacing::_Replace_impl<_Replacing::_Traits_8_sve>(_First, _Last, _Old_val, _New_val); } #else // ^^^ defined(_M_ARM64) || defined(_M_ARM64EC) / !defined(_M_ARM64) && !defined(_M_ARM64EC) vvv From ae5d1f375f53a2de4945f09d429938a895ade261 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 3 Apr 2026 07:50:36 -0700 Subject: [PATCH 7/9] Add const. --- stl/src/vector_algorithms.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index 14c9e752401..2e51ccee583 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -9609,7 +9609,7 @@ namespace { namespace _Replacing { #if defined(_M_ARM64) || defined(_M_ARM64EC) struct _Traits_1_sve { - static svuint8_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + static svuint8_t _Load(const svbool_t _Pred, const void* const _Ptr) noexcept { return svld1(_Pred, static_cast(_Ptr)); } @@ -9627,7 +9627,7 @@ namespace { }; struct _Traits_2_sve { - static svuint16_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + static svuint16_t _Load(const svbool_t _Pred, const void* const _Ptr) noexcept { return svld1(_Pred, static_cast(_Ptr)); } @@ -9645,7 +9645,7 @@ namespace { }; struct _Traits_4_sve { - static svuint32_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + static svuint32_t _Load(const svbool_t _Pred, const void* const _Ptr) noexcept { return svld1(_Pred, static_cast(_Ptr)); } @@ -9663,7 +9663,7 @@ namespace { }; struct _Traits_8_sve { - static svuint64_t _Load(svbool_t _Pred, const void* const _Ptr) noexcept { + static svuint64_t _Load(const svbool_t _Pred, const void* const _Ptr) noexcept { return svld1(_Pred, static_cast(_Ptr)); } @@ -9704,9 +9704,9 @@ namespace { } if (const size_t _Tail_length = _Size_bytes & size_t{_Sve_vl - 1}; _Tail_length != 0) { - auto _Tail_mask = svwhilelt_b8(size_t{0}, _Tail_length); - const auto _Data = _Traits::_Load(_Tail_mask, _First); - const auto _Mask = _Traits::_Cmp(_Tail_mask, _Data, _Comparand); + const auto _Tail_mask = svwhilelt_b8(size_t{0}, _Tail_length); + const auto _Data = _Traits::_Load(_Tail_mask, _First); + const auto _Mask = _Traits::_Cmp(_Tail_mask, _Data, _Comparand); _Traits::_Store(_Mask, _First, _Replacement); } } else { From 4bef3d4996e0088f7cc9c4e3542123d62b21ef90 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 22 Jun 2026 09:36:06 -0700 Subject: [PATCH 8/9] Add a comment about the SVE vector length, citing the Arm ARM. --- stl/src/vector_algorithms.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stl/src/vector_algorithms.cpp b/stl/src/vector_algorithms.cpp index ad50850c61b..207d8f8a11e 100644 --- a/stl/src/vector_algorithms.cpp +++ b/stl/src/vector_algorithms.cpp @@ -9690,6 +9690,9 @@ namespace { void* _First, void* const _Last, const _Ty _Old_val, const _Ty _New_val) noexcept { if (_Use_FEAT_SVE()) { + // Arm Architecture Reference Manual for A-profile architecture, + // B1.4.2 "Configurable SVE vector lengths": + // "The architecturally defined SVL set is all powers of two from 128 to 2048 bits inclusive." const size_t _Sve_vl = svcntb(); const size_t _Size_bytes = _Byte_length(_First, _Last); const size_t _Full_vl_bytes = _Size_bytes & ~size_t{_Sve_vl - 1}; From 2e427511c43ae019f4202cf2c712b3ddf6a5004f Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 22 Jun 2026 09:45:12 -0700 Subject: [PATCH 9/9] Update test coverage to enable 1 and 2 bytes for ARM64/ARM64EC. --- tests/std/tests/VSO_0000000_vector_algorithms/test.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp index 68467094e5b..77f9abe13b4 100644 --- a/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp +++ b/tests/std/tests/VSO_0000000_vector_algorithms/test.cpp @@ -767,8 +767,13 @@ void test_case_replace_copy(const vector& input, vector& out_expected, vec template void test_replace(mt19937_64& gen) { - // replace() is vectorized for 4 and 8 bytes only. +#if defined(_M_ARM64) || defined(_M_ARM64EC) + // For ARM64/ARM64EC, replace() is always vectorized. + constexpr bool replace_is_vectorized = true; +#else + // For x64/x86, replace() is vectorized for 4 and 8 bytes only. constexpr bool replace_is_vectorized = sizeof(T) >= 4; +#endif using TD = conditional_t; uniform_int_distribution dis(0, 9);