Skip to content

Commit c38f67a

Browse files
authored
add SSSE3 SIMD instructions (#1020)
1 parent 34a9393 commit c38f67a

File tree

3 files changed

+166
-4
lines changed

3 files changed

+166
-4
lines changed

include/ada/common_defs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,10 @@ namespace ada {
233233
} while (0)
234234
#endif
235235

236+
// ADA_SSSE3 is defined to 1 when the compiler predefines __SSSE3__. The
// sources test it to decide whether to include <tmmintrin.h> and to select
// the SSSE3 code paths ahead of the NEON ones.
#if defined(__SSSE3__)
237+
#define ADA_SSSE3 1
238+
#endif
239+
236240
#if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
237241
(defined(_M_AMD64) || defined(_M_X64) || \
238242
(defined(_M_IX86_FP) && _M_IX86_FP == 2))

src/helpers.cpp

Lines changed: 126 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@
55
#include "ada/common_defs.h"
66
#include "ada/scheme.h"
77

8+
#if ADA_SSSE3
9+
#include <tmmintrin.h>
10+
#endif
11+
812
namespace ada::helpers {
913

1014
template <typename out_iter>
@@ -178,7 +182,64 @@ ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept {
178182
// starting at index location, this finds the next location of a character
179183
// :, /, \\, ? or [. If none is found, view.size() is returned.
180184
// For use within get_host_delimiter_location.
181-
#if ADA_NEON
185+
#if ADA_SSSE3
186+
ada_really_inline size_t find_next_host_delimiter_special(
    std::string_view view, size_t location) noexcept {
  // SSSE3 implementation. Returns the index of the first ':', '/', '\\', '?'
  // or '[' at or after `location`, or view.size() when there is none.

  // Fewer than 16 bytes remain: scan them one at a time.
  if (view.size() - location < 16) {
    for (size_t pos = location; pos < view.size(); ++pos) {
      switch (view[pos]) {
        case ':':
        case '/':
        case '\\':
        case '?':
        case '[':
          return pos;
        default:
          break;
      }
    }
    return view.size();
  }

  // Vector path: every byte is classified through two 16-entry tables, one
  // indexed by its low nibble and one by its high nibble. A byte is a
  // delimiter exactly when both lookups share a set bit:
  //   bit 0x01 -> high nibble 0x3 with low nibble 0xA or 0xF  (':' and '?')
  //   bit 0x02 -> high nibble 0x2 with low nibble 0xF         ('/')
  //   bit 0x04 -> high nibble 0x5 with low nibble 0xB or 0xC  ('[' and '\\')
  const __m128i low_nibble_table =
      _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x01, 0x04, 0x04, 0x00, 0x00, 0x03);
  const __m128i high_nibble_table =
      _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const __m128i nibble_mask = _mm_set1_epi8(0xf);

  // Classifies the 16 bytes starting at `block`; bit k of the result is set
  // when byte k is a delimiter.
  const auto delimiter_bits = [&](const char* block) -> int {
    const __m128i bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
    const __m128i from_low =
        _mm_shuffle_epi8(low_nibble_table, _mm_and_si128(bytes, nibble_mask));
    const __m128i from_high = _mm_shuffle_epi8(
        high_nibble_table,
        _mm_and_si128(_mm_srli_epi16(bytes, 4), nibble_mask));
    const __m128i hits = _mm_and_si128(from_low, from_high);
    const __m128i misses = _mm_cmpeq_epi8(hits, _mm_setzero_si128());
    // The movemask only populates bits 0-15; complementing it would set bits
    // 16-31 as well, so keep just the low 16 bits.
    return ~_mm_movemask_epi8(misses) & 0xFFFF;
  };

  // Whole 16-byte blocks starting at `location`.
  size_t offset = location;
  while (offset + 15 < view.size()) {
    const int bits = delimiter_bits(view.data() + offset);
    if (bits != 0) {
      return offset + trailing_zeroes(static_cast<uint32_t>(bits));
    }
    offset += 16;
  }

  // Leftover bytes: re-read the final 16 bytes of the view. The part that
  // overlaps the blocks above was already found to be delimiter-free, so any
  // hit here lies in the not-yet-examined tail.
  if (offset < view.size()) {
    const size_t tail_start = view.length() - 16;
    const int bits = delimiter_bits(view.data() + tail_start);
    if (bits != 0) {
      return tail_start + trailing_zeroes(static_cast<uint32_t>(bits));
    }
  }
  return view.size();
}
242+
#elif ADA_NEON
182243
// The ada_make_uint8x16_t macro is necessary because Visual Studio does not
183244
// support direct initialization of uint8x16_t. See
184245
// https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon
@@ -417,7 +478,70 @@ ada_really_inline size_t find_next_host_delimiter_special(
417478
// starting at index location, this finds the next location of a character
418479
// :, /, ? or [. If none is found, view.size() is returned.
419480
// For use within get_host_delimiter_location.
420-
#if ADA_NEON
481+
#if ADA_SSSE3
482+
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
                                                  size_t location) noexcept {
  // SSSE3 implementation. Returns the index of the first ':', '/', '?' or '['
  // at or after `location`, or view.size() when there is none.

  // Fewer than 16 bytes remain: scan them one at a time.
  if (view.size() - location < 16) {
    for (size_t pos = location; pos < view.size(); ++pos) {
      switch (view[pos]) {
        case ':':
        case '/':
        case '?':
        case '[':
          return pos;
        default:
          break;
      }
    }
    return view.size();
  }

  // Vector path: every byte is classified through two 16-entry tables, one
  // indexed by its low nibble and one by its high nibble. A byte is a
  // delimiter exactly when both lookups share a set bit:
  //   ':' (0x3A): low[0xA]=0x01, high[0x3]=0x01 -> 0x01
  //   '/' (0x2F): low[0xF]=0x03, high[0x2]=0x02 -> 0x02
  //   '?' (0x3F): low[0xF]=0x03, high[0x3]=0x01 -> 0x01
  //   '[' (0x5B): low[0xB]=0x04, high[0x5]=0x04 -> 0x04
  const __m128i low_nibble_table =
      _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x01, 0x04, 0x00, 0x00, 0x00, 0x03);
  const __m128i high_nibble_table =
      _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
  const __m128i nibble_mask = _mm_set1_epi8(0xf);

  // Classifies the 16 bytes starting at `block`; bit k of the result is set
  // when byte k is a delimiter.
  const auto delimiter_bits = [&](const char* block) -> int {
    const __m128i bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
    const __m128i from_low =
        _mm_shuffle_epi8(low_nibble_table, _mm_and_si128(bytes, nibble_mask));
    const __m128i from_high = _mm_shuffle_epi8(
        high_nibble_table,
        _mm_and_si128(_mm_srli_epi16(bytes, 4), nibble_mask));
    const __m128i hits = _mm_and_si128(from_low, from_high);
    const __m128i misses = _mm_cmpeq_epi8(hits, _mm_setzero_si128());
    // The movemask only populates bits 0-15; complementing it would set bits
    // 16-31 as well, so keep just the low 16 bits.
    return ~_mm_movemask_epi8(misses) & 0xFFFF;
  };

  // Whole 16-byte blocks starting at `location`.
  size_t offset = location;
  while (offset + 15 < view.size()) {
    const int bits = delimiter_bits(view.data() + offset);
    if (bits != 0) {
      return offset + trailing_zeroes(static_cast<uint32_t>(bits));
    }
    offset += 16;
  }

  // Leftover bytes: re-read the final 16 bytes of the view. The part that
  // overlaps the blocks above was already found to be delimiter-free, so any
  // hit here lies in the not-yet-examined tail.
  if (offset < view.size()) {
    const size_t tail_start = view.length() - 16;
    const int bits = delimiter_bits(view.data() + tail_start);
    if (bits != 0) {
      return tail_start + trailing_zeroes(static_cast<uint32_t>(bits));
    }
  }
  return view.size();
}
544+
#elif ADA_NEON
421545
ada_really_inline size_t find_next_host_delimiter(std::string_view view,
422546
size_t location) noexcept {
423547
// first check for short strings in which case we do it naively.

src/unicode.cpp

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@ ADA_PUSH_DISABLE_ALL_WARNINGS
1010
ADA_POP_DISABLE_WARNINGS
1111

1212
#include <algorithm>
13-
#if ADA_NEON
13+
#if ADA_SSSE3
14+
#include <tmmintrin.h>
15+
#elif ADA_NEON
1416
#include <arm_neon.h>
1517
#elif ADA_SSE2
1618
#include <emmintrin.h>
@@ -57,7 +59,39 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
5759
}
5860
return non_ascii == 0;
5961
}
60-
#if ADA_NEON
62+
#if ADA_SSSE3
63+
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  // SSSE3 implementation. Reports whether the input contains '\t' (9),
  // '\n' (10) or '\r' (13).

  // Inputs shorter than one vector are handled by the scalar predicate.
  if (user_input.size() < 16) {
    return std::ranges::any_of(user_input, is_tabs_or_newline);
  }

  // 16-entry table consulted with each input byte as the shuffle index.
  // Slots 9, 10 and 13 hold their own index, so those three bytes map to
  // themselves. Slot 0 holds 1 so that a NUL byte does not compare equal to
  // its lookup; every other slot is 0.
  const __m128i self_table =
      _mm_setr_epi8(1, 0, 0, 0, 0, 0, 0, 0, 0, 9, 10, 0, 0, 13, 0, 0);

  // Returns 0xFF in every lane of the 16 bytes at `block` whose table lookup
  // equals the byte itself.
  const auto matching_lanes = [&self_table](const char* block) -> __m128i {
    const __m128i bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(block));
    return _mm_cmpeq_epi8(_mm_shuffle_epi8(self_table, bytes), bytes);
  };

  // OR the per-block results together and inspect them once at the end.
  __m128i found = _mm_setzero_si128();
  size_t offset = 0;
  while (offset + 15 < user_input.size()) {
    found = _mm_or_si128(found, matching_lanes(user_input.data() + offset));
    offset += 16;
  }

  // Leftover bytes: re-read the final 16 bytes of the input. Overlapping
  // bytes already scanned only repeats earlier results.
  if (offset < user_input.size()) {
    found = _mm_or_si128(
        found,
        matching_lanes(user_input.data() + user_input.length() - 16));
  }
  return _mm_movemask_epi8(found) != 0;
}
94+
#elif ADA_NEON
6195
ada_really_inline bool has_tabs_or_newline(
6296
std::string_view user_input) noexcept {
6397
// first check for short strings in which case we do it naively.

0 commit comments

Comments
 (0)