diff options
author | Jean Boussier <[email protected]> | 2025-01-31 12:38:15 +0100 |
---|---|---|
committer | Hiroshi SHIBATA <[email protected]> | 2025-02-03 10:05:25 +0900 |
commit | 98e1c2845a8361b69820c41b05eddbe5dbf8cf58 () | |
tree | 65529837959261e63887cb4f1ac7406ef4c3b59a /ext/json/generator | |
parent | 581d85058cf638f2f8ad87391dccc5c7708d597b (diff) |
[ruby/json] Refactor convert_UTF8_to_JSON to split searching and escaping code
The goal is to be able to dis to more optimized search implementations without having to duplicate the escaping code. Somehow, this is a few % faster already: ``` == Encoding activitypub.json (52595 bytes) ruby 3.4.1 (2024-12-25 revision https://.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 2.257k i/100ms Calculating ------------------------------------- after 22.930k (± 1.3%) i/s (43.61 μs/i) - 115.107k in 5.020814s Comparison: before: 21604.0 i/s after: 22930.1 i/s - 1.06x faster == Encoding citm_catalog.json (500298 bytes) ruby 3.4.1 (2024-12-25 revision https://.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 137.000 i/100ms Calculating ------------------------------------- after 1.397k (± 1.1%) i/s (715.57 μs/i) - 6.987k in 5.000408s Comparison: before: 1344.4 i/s after: 1397.5 i/s - 1.04x faster == Encoding twitter.json (466906 bytes) ruby 3.4.1 (2024-12-25 revision https://.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23] Warming up -------------------------------------- after 249.000 i/100ms Calculating ------------------------------------- after 2.464k (± 1.8%) i/s (405.81 μs/i) - 12.450k in 5.054131s Comparison: before: 2326.5 i/s after: 2464.2 i/s - 1.06x faster ``` https://.com/ruby/json/commit/8fb5ae807f
-rw-r--r-- | ext/json/generator/generator.c | 332 |
1 files changed, 175 insertions, 157 deletions
@@ -101,6 +101,7 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...) // 0 - single byte char that don't need to be escaped. // (x | 8) - char that needs to be escaped. static const unsigned char CHAR_LENGTH_MASK = 7; static const unsigned char escape_table[256] = { // ASCII Control Characters @@ -165,6 +166,84 @@ static const unsigned char script_safe_escape_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9, }; /* Converts in_string to a JSON string (without the wrapping '"' * characters) in FBuffer out_buffer. * @@ -181,182 +260,114 @@ static const unsigned char script_safe_escape_table[256] = { * Everything else (should be UTF-8) is just passed through and * appended to the result. */ -static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) { - const char *hexdig = "0123456789abcdef"; - char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; - - const char *ptr = RSTRING_PTR(str); - unsigned long len = RSTRING_LEN(str); - - unsigned long beg = 0, pos = 0; - -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; - while (pos < len) { - unsigned char ch = ptr[pos]; unsigned char ch_len = escape_table[ch]; - /* JSON encoding */ if (RB_UNLIKELY(ch_len)) { - switch (ch_len) { - case 9: { - FLUSH_POS(1); - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: { - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - break; - } - } - break; - } - case 11: { - unsigned char b2 = ptr[pos + 1]; - if (RB_UNLIKELY(b2 == 0x80)) { - unsigned char b3 = ptr[pos + 2]; - if (b3 == 0xA8) { - FLUSH_POS(3); - fbuffer_append(out_buffer, "\\u2028", 6); - break; - } else if (b3 == 0xA9) { - FLUSH_POS(3); - fbuffer_append(out_buffer, "\\u2029", 6); - break; - } - } - ch_len = 3; - // fallthrough - } - default: - pos += ch_len; - break; - } } else { - pos++; } } -#undef FLUSH_POS - - if (beg < len) { - fbuffer_append(out_buffer, &ptr[beg], len - beg); - } - - RB_GC_GUARD(str); } -static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256]) -{ - const char *hexdig = "0123456789abcdef"; - char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; - - const char *ptr = RSTRING_PTR(str); - unsigned long len = RSTRING_LEN(str); - - unsigned long beg = 0, pos = 0; - -#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; - - while (pos < len) { - unsigned char ch = ptr[pos]; - unsigned char ch_len = escape_table[ch]; - - if (RB_UNLIKELY(ch_len)) { - switch (ch_len) { - case 9: { - FLUSH_POS(1); - switch (ch) { - case '"': fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': fbuffer_append(out_buffer, "\\t", 2); break; - default: { - scratch[2] = '0'; - scratch[3] = '0'; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - break; - } - } break; } - default: { - uint32_t wchar = 0; - ch_len = ch_len & CHAR_LENGTH_MASK; - - switch(ch_len) { - case 2: - wchar = ptr[pos] & 0x1F; - break; - case 3: - wchar = ptr[pos] & 0x0F; - break; - case 4: - wchar = ptr[pos] & 0x07; - break; - } - for (short i = 1; i < ch_len; i++) { - wchar = (wchar << 6) | (ptr[pos+i] & 0x3F); - } - FLUSH_POS(ch_len); - if (wchar <= 0xFFFF) { - scratch[2] = hexdig[wchar >> 12]; - scratch[3] = hexdig[(wchar >> 8) & 0xf]; - scratch[4] = hexdig[(wchar >> 4) & 0xf]; - scratch[5] = hexdig[wchar & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - } else { - uint16_t hi, lo; - wchar -= 0x10000; - hi = 0xD800 + (uint16_t)(wchar >> 10); - lo = 0xDC00 + (uint16_t)(wchar & 0x3FF); - - scratch[2] = hexdig[hi >> 12]; - scratch[3] = hexdig[(hi >> 8) & 0xf]; - scratch[4] = hexdig[(hi >> 4) & 0xf]; - scratch[5] = hexdig[hi & 0xf]; - - scratch[8] = hexdig[lo >> 12]; - scratch[9] = hexdig[(lo >> 8) & 0xf]; - scratch[10] = hexdig[(lo >> 4) & 0xf]; - scratch[11] = hexdig[lo & 0xf]; - - fbuffer_append(out_buffer, scratch, 12); - } - break; - } } - } else { - pos++; } } -#undef FLUSH_POS - if (beg < len) { - fbuffer_append(out_buffer, &ptr[beg], len - beg); } - - RB_GC_GUARD(str); } /* @@ -911,13 +922,20 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat fbuffer_append_char(buffer, '"'); switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: case ENC_CODERANGE_VALID: if (RB_UNLIKELY(state->ascii_only)) { - convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table); } else { - convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); } break; default: |