[ruby/json] Refactor convert_UTF8_to_JSON to split searching and escaping code

The goal is to be able to dispatch to more optimized search implementations without having to duplicate the escaping code.

Somehow, this is a few % faster already:

```
== Encoding activitypub.json (52595 bytes)
ruby 3.4.1 (2024-12-25 revision https://github.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after 2.257k i/100ms
Calculating -------------------------------------
               after 22.930k (± 1.3%) i/s (43.61 μs/i) - 115.107k in 5.020814s

Comparison:
              before: 21604.0 i/s
               after: 22930.1 i/s - 1.06x faster

== Encoding citm_catalog.json (500298 bytes)
ruby 3.4.1 (2024-12-25 revision https://github.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after 137.000 i/100ms
Calculating -------------------------------------
               after 1.397k (± 1.1%) i/s (715.57 μs/i) - 6.987k in 5.000408s

Comparison:
              before: 1344.4 i/s
               after: 1397.5 i/s - 1.04x faster

== Encoding twitter.json (466906 bytes)
ruby 3.4.1 (2024-12-25 revision https://github.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after 249.000 i/100ms
Calculating -------------------------------------
               after 2.464k (± 1.8%) i/s (405.81 μs/i) - 12.450k in 5.054131s

Comparison:
              before: 2326.5 i/s
               after: 2464.2 i/s - 1.06x faster
```

https://github.com/ruby/json/commit/8fb5ae807f
author: Jean Boussier <[email protected]> 2025-01-31 12:38:15 +0100
committer: Hiroshi SHIBATA <[email protected]> 2025-02-03 10:05:25 +0900
commit: 98e1c2845a8361b69820c41b05eddbe5dbf8cf58 (patch)
tree: 65529837959261e63887cb4f1ac7406ef4c3b59a /ext/json/generator
parent: 581d85058cf638f2f8ad87391dccc5c7708d597b (diff)
1 files changed, 175 insertions, 157 deletions
@@ -101,6 +101,7 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
 // 0 - single byte char that don't need to be escaped.
 // (x | 8) - char that needs to be escaped.
 static const unsigned char CHAR_LENGTH_MASK = 7;
 static const unsigned char escape_table[256] = {
 // ASCII Control Characters
@@ -165,6 +166,84 @@ static const unsigned char script_safe_escape_table[256] = {
 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
 };
 /* Converts in_string to a JSON string (without the wrapping '"'
 * characters) in FBuffer out_buffer.
 *
@@ -181,182 +260,114 @@ static const unsigned char script_safe_escape_table[256] = {
 * Everything else (should be UTF-8) is just passed through and
 * appended to the result.
 */
-static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
 {
- const char *hexdig = "0123456789abcdef";
- char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
-
- const char *ptr = RSTRING_PTR(str);
- unsigned long len = RSTRING_LEN(str);
-
- unsigned long beg = 0, pos = 0;
-
-#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
- while (pos < len) {
- unsigned char ch = ptr[pos];
 unsigned char ch_len = escape_table[ch];
- /* JSON encoding */
 if (RB_UNLIKELY(ch_len)) {
- switch (ch_len) {
- case 9: {
- FLUSH_POS(1);
- switch (ch) {
- case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
- case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
- case '/': fbuffer_append(out_buffer, "\\/", 2); break;
- case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
- case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
- case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
- case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
- case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
- default: {
- scratch[2] = '0';
- scratch[3] = '0';
- scratch[4] = hexdig[(ch >> 4) & 0xf];
- scratch[5] = hexdig[ch & 0xf];
- fbuffer_append(out_buffer, scratch, 6);
- break;
- }
- }
- break;
- }
- case 11: {
- unsigned char b2 = ptr[pos + 1];
- if (RB_UNLIKELY(b2 == 0x80)) {
- unsigned char b3 = ptr[pos + 2];
- if (b3 == 0xA8) {
- FLUSH_POS(3);
- fbuffer_append(out_buffer, "\\u2028", 6);
- break;
- } else if (b3 == 0xA9) {
- FLUSH_POS(3);
- fbuffer_append(out_buffer, "\\u2029", 6);
- break;
- }
- }
- ch_len = 3;
- // fallthrough
- }
- default:
- pos += ch_len;
- break;
- }
 } else {
- pos++;
 }
 }
-#undef FLUSH_POS
-
- if (beg < len) {
- fbuffer_append(out_buffer, &ptr[beg], len - beg);
- }
-
- RB_GC_GUARD(str);
 }
-static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
-{
- const char *hexdig = "0123456789abcdef";
- char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
-
- const char *ptr = RSTRING_PTR(str);
- unsigned long len = RSTRING_LEN(str);
-
- unsigned long beg = 0, pos = 0;
-
-#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
-
- while (pos < len) {
- unsigned char ch = ptr[pos];
- unsigned char ch_len = escape_table[ch];
-
- if (RB_UNLIKELY(ch_len)) {
- switch (ch_len) {
- case 9: {
- FLUSH_POS(1);
- switch (ch) {
- case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
- case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
- case '/': fbuffer_append(out_buffer, "\\/", 2); break;
- case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
- case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
- case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
- case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
- case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
- default: {
- scratch[2] = '0';
- scratch[3] = '0';
- scratch[4] = hexdig[(ch >> 4) & 0xf];
- scratch[5] = hexdig[ch & 0xf];
- fbuffer_append(out_buffer, scratch, 6);
- break;
- }
- }
 break;
 }
- default: {
- uint32_t wchar = 0;
- ch_len = ch_len & CHAR_LENGTH_MASK;
-
- switch(ch_len) {
- case 2:
- wchar = ptr[pos] & 0x1F;
- break;
- case 3:
- wchar = ptr[pos] & 0x0F;
- break;
- case 4:
- wchar = ptr[pos] & 0x07;
- break;
- }
- for (short i = 1; i < ch_len; i++) {
- wchar = (wchar << 6) | (ptr[pos+i] & 0x3F);
- }
- FLUSH_POS(ch_len);
- if (wchar <= 0xFFFF) {
- scratch[2] = hexdig[wchar >> 12];
- scratch[3] = hexdig[(wchar >> 8) & 0xf];
- scratch[4] = hexdig[(wchar >> 4) & 0xf];
- scratch[5] = hexdig[wchar & 0xf];
- fbuffer_append(out_buffer, scratch, 6);
- } else {
- uint16_t hi, lo;
- wchar -= 0x10000;
- hi = 0xD800 + (uint16_t)(wchar >> 10);
- lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
-
- scratch[2] = hexdig[hi >> 12];
- scratch[3] = hexdig[(hi >> 8) & 0xf];
- scratch[4] = hexdig[(hi >> 4) & 0xf];
- scratch[5] = hexdig[hi & 0xf];
-
- scratch[8] = hexdig[lo >> 12];
- scratch[9] = hexdig[(lo >> 8) & 0xf];
- scratch[10] = hexdig[(lo >> 4) & 0xf];
- scratch[11] = hexdig[lo & 0xf];
-
- fbuffer_append(out_buffer, scratch, 12);
- }
- break;
- }
 }
- } else {
- pos++;
 }
 }
-#undef FLUSH_POS
- if (beg < len) {
- fbuffer_append(out_buffer, &ptr[beg], len - beg);
 }
-
- RB_GC_GUARD(str);
 }
 /*
@@ -911,13 +922,20 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
 fbuffer_append_char(buffer, '"');
 switch(rb_enc_str_coderange(obj)) {
 case ENC_CODERANGE_7BIT:
 case ENC_CODERANGE_VALID:
 if (RB_UNLIKELY(state->ascii_only)) {
- convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
 } else {
- convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
 }
 break;
 default:
author	Jean Boussier <[email protected]>	2025-01-31 12:38:15 +0100
committer	Hiroshi SHIBATA <[email protected]>	2025-02-03 10:05:25 +0900
commit	98e1c2845a8361b69820c41b05eddbe5dbf8cf58 (patch)
tree	65529837959261e63887cb4f1ac7406ef4c3b59a /ext/json/generator
parent	581d85058cf638f2f8ad87391dccc5c7708d597b (diff)