Last active
September 21, 2023 21:04
-
-
Save bkietz/345c0a76c010e7b23c3b9d9c4fc11e1f to your computer and use it in GitHub Desktop.
C++17 string search with SSE4.2's `_mm_cmpistrm`
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if defined(__SSE4_2__) | |
#if defined(__has_builtin) | |
#if __has_builtin(__builtin_ctz) | |
#include <immintrin.h> | |
#include <cstring> | |
#define ARROW_TOO_SIMD_FIND_FIRST_OF | |
#endif | |
#endif | |
#endif | |
namespace arrow_too { | |
/// Use like | |
/// | |
/// find_first.of<' ', '\t', '\n', '\r', '\0'>("hello world"); | |
/// | |
/// Max 16 needles are supported | |
struct find_first { | |
template <char... NEEDLES> | |
size_t of(std::string_view s) const { | |
return _impl<false, NEEDLES...>(s); | |
} | |
template <char... NEEDLES> | |
size_t not_of(std::string_view s) const { | |
return _impl<true, NEEDLES...>(s); | |
} | |
template <bool SWAP_POLARITY, char... NEEDLES> | |
size_t _impl(std::string_view s) const { | |
size_t i = 0; | |
#ifdef ARROW_TOO_SIMD_FIND_FIRST_OF | |
#undef ARROW_TOO_SIMD_FIND_FIRST_OF | |
struct Batch { | |
explicit Batch(void const *data) { memcpy(&i, data, sizeof(i)); } | |
__m128i i; | |
}; | |
thread_local const Batch NEEDLES_BATCH{[] { | |
char needles_array[sizeof(__m128i)] = {NEEDLES...}; | |
// CXX(17:dcl.init.aggr#8.2) If the initializer list does not have an explicit | |
// intitializer for an element of the array, the corresponding element will be | |
// initialized from an empty initializer list. In this case that means any elements | |
// not initialized from NEEDLES will be `char{}`, aka '\0'. This would imlicitly | |
// add '\0' to every lookup set except ones with exactly 16 chars, so we overwrite | |
// that padding with the first character of NEEDLES. | |
for (size_t i = sizeof...(NEEDLES); i < sizeof(needles_array); ++i) { | |
needles_array[i] = needles_array[0]; | |
} | |
return Batch{&needles_array}; | |
}()}; | |
constexpr auto FLAGS = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK | |
| (SWAP_POLARITY ? _SIDD_NEGATIVE_POLARITY : 0); | |
for (size_t batch_count = s.size() / sizeof(Batch); batch_count > 0; --batch_count) { | |
Batch batch{s.data() + i}; | |
auto mask = static_cast<uint16_t>( | |
_mm_cvtsi128_si32(_mm_cmpistrm(NEEDLES_BATCH.i, batch.i, FLAGS))); | |
if (mask != 0) [[likely]] { | |
return i + __builtin_ctz(mask); | |
} | |
i += sizeof(Batch); | |
} | |
s = s.substr(i); | |
#endif | |
for (char c : s) { | |
auto any_match = (... or (c == NEEDLES)); | |
if constexpr (not SWAP_POLARITY) { | |
if (any_match) break; | |
// For example, NEEDLES contains a delimiter which we're seeking. | |
// If any_match is true, we've found a delimiter. | |
} else { | |
if (not any_match) break; | |
// For example, NEEDLES contains whitespace which we're skipping. | |
// If any_match is false, we've found a non-whitespace. | |
} | |
++i; | |
} | |
return i; | |
} | |
} find_first; | |
} // namespace arrow_too |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment