|
5 | 5 | #include "ada/common_defs.h" |
6 | 6 | #include "ada/scheme.h" |
7 | 7 |
|
| 8 | +#if ADA_SSSE3 |
| 9 | +#include <tmmintrin.h> |
| 10 | +#endif |
| 11 | + |
8 | 12 | namespace ada::helpers { |
9 | 13 |
|
10 | 14 | template <typename out_iter> |
@@ -178,7 +182,64 @@ ada_really_inline int trailing_zeroes(uint32_t input_num) noexcept { |
178 | 182 | // starting at index location, this finds the next location of a character |
179 | 183 | // :, /, \\, ? or [. If none is found, view.size() is returned. |
180 | 184 | // For use within get_host_delimiter_location. |
181 | | -#if ADA_NEON |
| 185 | +#if ADA_SSSE3 |
| 186 | +ada_really_inline size_t find_next_host_delimiter_special( |
| 187 | + std::string_view view, size_t location) noexcept { |
| 188 | + // first check for short strings in which case we do it naively. |
| 189 | + if (view.size() - location < 16) { // slow path |
| 190 | + for (size_t i = location; i < view.size(); i++) { |
| 191 | + if (view[i] == ':' || view[i] == '/' || view[i] == '\\' || |
| 192 | + view[i] == '?' || view[i] == '[') { |
| 193 | + return i; |
| 194 | + } |
| 195 | + } |
| 196 | + return size_t(view.size()); |
| 197 | + } |
| 198 | + // fast path for long strings (expected to be common) |
| 199 | + // Using SSSE3's _mm_shuffle_epi8 for table lookup (same approach as NEON) |
| 200 | + size_t i = location; |
| 201 | + const __m128i low_mask = |
| 202 | + _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 203 | + 0x01, 0x04, 0x04, 0x00, 0x00, 0x03); |
| 204 | + const __m128i high_mask = |
| 205 | + _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, |
| 206 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
| 207 | + const __m128i fmask = _mm_set1_epi8(0xf); |
| 208 | + const __m128i zero = _mm_setzero_si128(); |
| 209 | + for (; i + 15 < view.size(); i += 16) { |
| 210 | + __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i)); |
| 211 | + __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask)); |
| 212 | + __m128i highpart = _mm_shuffle_epi8( |
| 213 | + high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask)); |
| 214 | + __m128i classify = _mm_and_si128(lowpart, highpart); |
| 215 | + __m128i is_zero = _mm_cmpeq_epi8(classify, zero); |
| 216 | + // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31 |
| 217 | + // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to |
| 218 | + // avoid false positives. |
| 219 | + int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF; |
| 220 | + if (mask != 0) { |
| 221 | + return i + trailing_zeroes(static_cast<uint32_t>(mask)); |
| 222 | + } |
| 223 | + } |
| 224 | + if (i < view.size()) { |
| 225 | + __m128i word = |
| 226 | + _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16)); |
| 227 | + __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask)); |
| 228 | + __m128i highpart = _mm_shuffle_epi8( |
| 229 | + high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask)); |
| 230 | + __m128i classify = _mm_and_si128(lowpart, highpart); |
| 231 | + __m128i is_zero = _mm_cmpeq_epi8(classify, zero); |
| 232 | + // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31 |
| 233 | + // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to |
| 234 | + // avoid false positives. |
| 235 | + int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF; |
| 236 | + if (mask != 0) { |
| 237 | + return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask)); |
| 238 | + } |
| 239 | + } |
| 240 | + return size_t(view.size()); |
| 241 | +} |
| 242 | +#elif ADA_NEON |
182 | 243 | // The ada_make_uint8x16_t macro is necessary because Visual Studio does not |
183 | 244 | // support direct initialization of uint8x16_t. See |
184 | 245 | // https://developercommunity.visualstudio.com/t/error-C2078:-too-many-initializers-whe/402911?q=backend+neon |
@@ -417,7 +478,70 @@ ada_really_inline size_t find_next_host_delimiter_special( |
417 | 478 | // starting at index location, this finds the next location of a character |
418 | 479 | // :, /, ? or [. If none is found, view.size() is returned. |
419 | 480 | // For use within get_host_delimiter_location. |
420 | | -#if ADA_NEON |
| 481 | +#if ADA_SSSE3 |
| 482 | +ada_really_inline size_t find_next_host_delimiter(std::string_view view, |
| 483 | + size_t location) noexcept { |
| 484 | + // first check for short strings in which case we do it naively. |
| 485 | + if (view.size() - location < 16) { // slow path |
| 486 | + for (size_t i = location; i < view.size(); i++) { |
| 487 | + if (view[i] == ':' || view[i] == '/' || view[i] == '?' || |
| 488 | + view[i] == '[') { |
| 489 | + return i; |
| 490 | + } |
| 491 | + } |
| 492 | + return size_t(view.size()); |
| 493 | + } |
| 494 | + // fast path for long strings (expected to be common) |
| 495 | + size_t i = location; |
| 496 | + // Lookup tables for bit classification: |
| 497 | + // ':' (0x3A): low[0xA]=0x01, high[0x3]=0x01 -> match |
| 498 | + // '/' (0x2F): low[0xF]=0x02, high[0x2]=0x02 -> match |
| 499 | + // '?' (0x3F): low[0xF]=0x01, high[0x3]=0x01 -> match |
| 500 | + // '[' (0x5B): low[0xB]=0x04, high[0x5]=0x04 -> match |
| 501 | + const __m128i low_mask = |
| 502 | + _mm_setr_epi8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 503 | + 0x01, 0x04, 0x00, 0x00, 0x00, 0x03); |
| 504 | + const __m128i high_mask = |
| 505 | + _mm_setr_epi8(0x00, 0x00, 0x02, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, |
| 506 | + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
| 507 | + const __m128i fmask = _mm_set1_epi8(0xf); |
| 508 | + const __m128i zero = _mm_setzero_si128(); |
| 509 | + |
| 510 | + for (; i + 15 < view.size(); i += 16) { |
| 511 | + __m128i word = _mm_loadu_si128((const __m128i*)(view.data() + i)); |
| 512 | + __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask)); |
| 513 | + __m128i highpart = _mm_shuffle_epi8( |
| 514 | + high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask)); |
| 515 | + __m128i classify = _mm_and_si128(lowpart, highpart); |
| 516 | + __m128i is_zero = _mm_cmpeq_epi8(classify, zero); |
| 517 | + // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31 |
| 518 | + // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to |
| 519 | + // avoid false positives. |
| 520 | + int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF; |
| 521 | + if (mask != 0) { |
| 522 | + return i + trailing_zeroes(static_cast<uint32_t>(mask)); |
| 523 | + } |
| 524 | + } |
| 525 | + |
| 526 | + if (i < view.size()) { |
| 527 | + __m128i word = |
| 528 | + _mm_loadu_si128((const __m128i*)(view.data() + view.length() - 16)); |
| 529 | + __m128i lowpart = _mm_shuffle_epi8(low_mask, _mm_and_si128(word, fmask)); |
| 530 | + __m128i highpart = _mm_shuffle_epi8( |
| 531 | + high_mask, _mm_and_si128(_mm_srli_epi16(word, 4), fmask)); |
| 532 | + __m128i classify = _mm_and_si128(lowpart, highpart); |
| 533 | + __m128i is_zero = _mm_cmpeq_epi8(classify, zero); |
| 534 | + // _mm_movemask_epi8 returns a 16-bit mask in bits 0-15, with bits 16-31 |
| 535 | + // zero. After NOT (~), bits 16-31 become 1. We must mask to 16 bits to |
| 536 | + // avoid false positives. |
| 537 | + int mask = ~_mm_movemask_epi8(is_zero) & 0xFFFF; |
| 538 | + if (mask != 0) { |
| 539 | + return view.length() - 16 + trailing_zeroes(static_cast<uint32_t>(mask)); |
| 540 | + } |
| 541 | + } |
| 542 | + return size_t(view.size()); |
| 543 | +} |
| 544 | +#elif ADA_NEON |
421 | 545 | ada_really_inline size_t find_next_host_delimiter(std::string_view view, |
422 | 546 | size_t location) noexcept { |
423 | 547 | // first check for short strings in which case we do it naively. |
|
0 commit comments