|
1 |
| -/* auto-generated on 2024-12-17 14:54:59 -0500. Do not edit! */ |
| 1 | +/* auto-generated on 2024-12-26 12:42:33 -0500. Do not edit! */ |
2 | 2 | /* begin file src/simdutf.cpp */
|
3 | 3 | #include "simdutf.h"
|
4 | 4 | // We include base64_tables once.
|
@@ -697,6 +697,15 @@ static_assert(to_base64_url_value[uint8_t('_')] == 63,
|
697 | 697 | #include <climits>
|
698 | 698 | #include <type_traits>
|
699 | 699 |
|
// Compile-time sanity checks on the character types: the fixed-width integer
// types must have the same size as the corresponding character types.
static_assert(sizeof(uint8_t) == sizeof(char),
              "simdutf requires that uint8_t be a char");
static_assert(sizeof(uint16_t) == sizeof(char16_t),
              "simdutf requires that char16_t be 16 bits");
static_assert(sizeof(uint32_t) == sizeof(char32_t),
              "simdutf requires that char32_t be 32 bits");
// next line is redundant, but it is kept to catch defective systems.
static_assert(CHAR_BIT == 8, "simdutf requires 8-bit bytes");
| 708 | + |
700 | 709 | // Useful for debugging purposes
|
701 | 710 | namespace simdutf {
|
702 | 711 | namespace {
|
@@ -9746,24 +9755,23 @@ inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
|
9746 | 9755 | }
|
9747 | 9756 |
|
9748 | 9757 | template <endianness big_endian>
|
9749 |
| -inline simdutf_warn_unused bool validate(const char16_t *buf, |
| 9758 | +inline simdutf_warn_unused bool validate(const char16_t *data, |
9750 | 9759 | size_t len) noexcept {
|
9751 |
| -const uint16_t *data = reinterpret_cast<const uint16_t *>(buf); |
9752 | 9760 | uint64_t pos = 0;
|
9753 | 9761 | while (pos < len) {
|
9754 |
| -uint16_t word = |
| 9762 | +char16_t word = |
9755 | 9763 | !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
|
9756 | 9764 | if ((word & 0xF800) == 0xD800) {
|
9757 | 9765 | if (pos + 1 >= len) {
|
9758 | 9766 | return false;
|
9759 | 9767 | }
|
9760 |
| -uint16_t diff = uint16_t(word - 0xD800); |
| 9768 | +char16_t diff = char16_t(word - 0xD800); |
9761 | 9769 | if (diff > 0x3FF) {
|
9762 | 9770 | return false;
|
9763 | 9771 | }
|
9764 |
| -uint16_t next_word = |
| 9772 | +char16_t next_word = |
9765 | 9773 | !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
|
9766 |
| -uint16_t diff2 = uint16_t(next_word - 0xDC00); |
| 9774 | +char16_t diff2 = char16_t(next_word - 0xDC00); |
9767 | 9775 | if (diff2 > 0x3FF) {
|
9768 | 9776 | return false;
|
9769 | 9777 | }
|
@@ -9776,24 +9784,23 @@ inline simdutf_warn_unused bool validate(const char16_t *buf,
|
9776 | 9784 | }
|
9777 | 9785 |
|
9778 | 9786 | template <endianness big_endian>
|
9779 |
| -inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, |
| 9787 | +inline simdutf_warn_unused result validate_with_errors(const char16_t *data, |
9780 | 9788 | size_t len) noexcept {
|
9781 |
| -const uint16_t *data = reinterpret_cast<const uint16_t *>(buf); |
9782 | 9789 | size_t pos = 0;
|
9783 | 9790 | while (pos < len) {
|
9784 |
| -uint16_t word = |
| 9791 | +char16_t word = |
9785 | 9792 | !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
|
9786 | 9793 | if ((word & 0xF800) == 0xD800) {
|
9787 | 9794 | if (pos + 1 >= len) {
|
9788 | 9795 | return result(error_code::SURROGATE, pos);
|
9789 | 9796 | }
|
9790 |
| -uint16_t diff = uint16_t(word - 0xD800); |
| 9797 | +char16_t diff = char16_t(word - 0xD800); |
9791 | 9798 | if (diff > 0x3FF) {
|
9792 | 9799 | return result(error_code::SURROGATE, pos);
|
9793 | 9800 | }
|
9794 |
| -uint16_t next_word = |
| 9801 | +char16_t next_word = |
9795 | 9802 | !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
|
9796 |
| -uint16_t diff2 = uint16_t(next_word - 0xDC00); |
| 9803 | +char16_t diff2 = uint16_t(next_word - 0xDC00); |
9797 | 9804 | if (diff2 > 0x3FF) {
|
9798 | 9805 | return result(error_code::SURROGATE, pos);
|
9799 | 9806 | }
|
@@ -9806,24 +9813,22 @@ inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
|
9806 | 9813 | }
|
9807 | 9814 |
|
9808 | 9815 | template <endianness big_endian>
|
9809 |
| -inline size_t count_code_points(const char16_t *buf, size_t len) { |
| 9816 | +inline size_t count_code_points(const char16_t *p, size_t len) { |
9810 | 9817 | // We are not BOM aware.
|
9811 |
| -const uint16_t *p = reinterpret_cast<const uint16_t *>(buf); |
9812 | 9818 | size_t counter{0};
|
9813 | 9819 | for (size_t i = 0; i < len; i++) {
|
9814 |
| -uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; |
| 9820 | +char16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; |
9815 | 9821 | counter += ((word & 0xFC00) != 0xDC00);
|
9816 | 9822 | }
|
9817 | 9823 | return counter;
|
9818 | 9824 | }
|
9819 | 9825 |
|
9820 | 9826 | template <endianness big_endian>
|
9821 |
| -inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) { |
| 9827 | +inline size_t utf8_length_from_utf16(const char16_t *p, size_t len) { |
9822 | 9828 | // We are not BOM aware.
|
9823 |
| -const uint16_t *p = reinterpret_cast<const uint16_t *>(buf); |
9824 | 9829 | size_t counter{0};
|
9825 | 9830 | for (size_t i = 0; i < len; i++) {
|
9826 |
| -uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; |
| 9831 | +char16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; |
9827 | 9832 | counter++; // ASCII
|
9828 | 9833 | counter += static_cast<size_t>(
|
9829 | 9834 | word >
|
@@ -9835,25 +9840,22 @@ inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
|
9835 | 9840 | }
|
9836 | 9841 |
|
9837 | 9842 | template <endianness big_endian>
|
9838 |
| -inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) { |
| 9843 | +inline size_t utf32_length_from_utf16(const char16_t *p, size_t len) { |
9839 | 9844 | // We are not BOM aware.
|
9840 |
| -const uint16_t *p = reinterpret_cast<const uint16_t *>(buf); |
9841 | 9845 | size_t counter{0};
|
9842 | 9846 | for (size_t i = 0; i < len; i++) {
|
9843 |
| -uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; |
| 9847 | +char16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; |
9844 | 9848 | counter += ((word & 0xFC00) != 0xDC00);
|
9845 | 9849 | }
|
9846 | 9850 | return counter;
|
9847 | 9851 | }
|
9848 | 9852 |
|
/**
 * Returns its argument unchanged: the Latin-1 length is reported as equal to
 * the given UTF-16 length.
 *
 * @param len  length of the UTF-16 input
 * @return     len
 */
inline size_t latin1_length_from_utf16(size_t len) { return len; }
|
9850 | 9854 |
|
9851 |
| -simdutf_really_inline void change_endianness_utf16(const char16_t *in, |
9852 |
| -size_t size, char16_t *out) { |
9853 |
| -const uint16_t *input = reinterpret_cast<const uint16_t *>(in); |
9854 |
| -uint16_t *output = reinterpret_cast<uint16_t *>(out); |
| 9855 | +simdutf_really_inline void |
| 9856 | +change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) { |
9855 | 9857 | for (size_t i = 0; i < size; i++) {
|
9856 |
| -*output++ = uint16_t(input[i] >> 8 | input[i] << 8); |
| 9858 | +*output++ = char16_t(input[i] >> 8 | input[i] << 8); |
9857 | 9859 | }
|
9858 | 9860 | }
|
9859 | 9861 |
|
@@ -21042,6 +21044,9 @@ struct validating_transcoder {
|
21042 | 21044 | uint64_t utf8_continuation_mask =
|
21043 | 21045 | input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
|
21044 | 21046 | // this case, we also have ASCII to account for.
|
| 21047 | +if (utf8_continuation_mask & 1) { |
| 21048 | +return 0; // error |
| 21049 | +} |
21045 | 21050 | uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
21046 | 21051 | uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
|
21047 | 21052 | // We process in blocks of up to 12 bytes except possibly
|
@@ -26717,6 +26722,14 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
|
26717 | 26722 | }
|
26718 | 26723 |
|
26719 | 26724 | if (!ignore_garbage && equalsigns > 0) {
|
| 26725 | +if (last_chunk_options == last_chunk_handling_options::strict) { |
| 26726 | +return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), |
| 26727 | +size_t(dst - dstinit)}; |
| 26728 | +} |
| 26729 | +if (last_chunk_options == |
| 26730 | +last_chunk_handling_options::stop_before_partial) { |
| 26731 | +return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; |
| 26732 | +} |
26720 | 26733 | if ((size_t(dst - dstinit) % 3 == 0) ||
|
26721 | 26734 | ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
|
26722 | 26735 | return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
|
@@ -33161,6 +33174,9 @@ struct validating_transcoder {
|
33161 | 33174 | uint64_t utf8_continuation_mask =
|
33162 | 33175 | input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
|
33163 | 33176 | // this case, we also have ASCII to account for.
|
| 33177 | +if (utf8_continuation_mask & 1) { |
| 33178 | +return 0; // error |
| 33179 | +} |
33164 | 33180 | uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
33165 | 33181 | uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
|
33166 | 33182 | // We process in blocks of up to 12 bytes except possibly
|
@@ -43013,6 +43029,9 @@ struct validating_transcoder {
|
43013 | 43029 | uint64_t utf8_continuation_mask =
|
43014 | 43030 | input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
|
43015 | 43031 | // this case, we also have ASCII to account for.
|
| 43032 | +if (utf8_continuation_mask & 1) { |
| 43033 | +return 0; // error |
| 43034 | +} |
43016 | 43035 | uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
43017 | 43036 | uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
|
43018 | 43037 | // We process in blocks of up to 12 bytes except possibly
|
@@ -48110,6 +48129,9 @@ struct validating_transcoder {
|
48110 | 48129 | uint64_t utf8_continuation_mask =
|
48111 | 48130 | input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
|
48112 | 48131 | // this case, we also have ASCII to account for.
|
| 48132 | +if (utf8_continuation_mask & 1) { |
| 48133 | +return 0; // error |
| 48134 | +} |
48113 | 48135 | uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
48114 | 48136 | uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
|
48115 | 48137 | // We process in blocks of up to 12 bytes except possibly
|
@@ -54454,6 +54476,9 @@ struct validating_transcoder {
|
54454 | 54476 | uint64_t utf8_continuation_mask =
|
54455 | 54477 | input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
|
54456 | 54478 | // this case, we also have ASCII to account for.
|
| 54479 | +if (utf8_continuation_mask & 1) { |
| 54480 | +return 0; // error |
| 54481 | +} |
54457 | 54482 | uint64_t utf8_leading_mask = ~utf8_continuation_mask;
|
54458 | 54483 | uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
|
54459 | 54484 | // We process in blocks of up to 12 bytes except possibly
|
|
0 commit comments