summaryrefslogtreecommitdiff
path: root/ext/json
diff options
context:
space:
mode:
authorJean Boussier <[email protected]>2025-01-16 18:31:18 +0100
committerHiroshi SHIBATA <[email protected]>2025-01-20 16:09:00 +0900
commitdf8f93848e46bf19b7b548fd30e033728a5f728f ()
treeb27e958bfe8bc6b5cc2b69453cb2fb4767f508cf /ext/json
parentba8f22c040bcc1efdce30ba4110e9fc261345bd5 (diff)
[ruby/json] json_string_unescape: use memchr to search for backslashes
https://.com/ruby/json/commit/5e6cfcf724
Notes: Merged: https://.com/ruby/ruby/pull/12598
-rw-r--r--ext/json/parser/parser.c151
1 files changed, 71 insertions, 80 deletions
@@ -592,96 +592,87 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
}
}
- pe = memchr(p, '\\', bufferSize);
- if (RB_UNLIKELY(pe == NULL)) {
- return build_string(string, stringEnd, intern, symbolize);
- }
-
VALUE result = rb_str_buf_new(bufferSize);
rb_enc_associate_index(result, utf8_encindex);
buffer = RSTRING_PTR(result);
bufferStart = buffer;
- while (pe < stringEnd) {
- if (*pe == '\\') {
- unescape = (char *) "?";
- unescape_len = 1;
- if (pe > p) {
- MEMCPY(buffer, p, char, pe - p);
- buffer += pe - p;
- }
- switch (*++pe) {
- case 'n':
- unescape = (char *) "\n";
- break;
- case 'r':
- unescape = (char *) "\r";
- break;
- case 't':
- unescape = (char *) "\t";
- break;
- case '"':
- unescape = (char *) "\"";
- break;
- case '\\':
- unescape = (char *) "\\";
- break;
- case 'b':
- unescape = (char *) "\b";
- break;
- case 'f':
- unescape = (char *) "\f";
- break;
- case 'u':
- if (pe > stringEnd - 4) {
- raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
- } else {
- uint32_t ch = unescape_unicode((unsigned char *) ++pe);
- pe += 3;
- /* To handle values above U+FFFF, we take a sequence of
- * \uXXXX escapes in the U+D800..U+DBFF then
- * U+DC00..U+DFFF ranges, take the low 10 bits from each
- * to make a 20-bit number, then add 0x10000 to get the
- * final codepoint.
- *
- * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
- * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
- * Area".
- */
- if ((ch & 0xFC00) == 0xD800) {
- pe++;
- if (pe > stringEnd - 6) {
- raise_parse_error("incomplete surrogate pair at '%s'", p);
- }
- if (pe[0] == '\\' && pe[1] == 'u') {
- uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
- ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
- | (sur & 0x3FF));
- pe += 5;
- } else {
- unescape = (char *) "?";
- break;
- }
}
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
- unescape = buf;
}
- break;
- default:
- p = pe;
- continue;
- }
- MEMCPY(buffer, unescape, char, unescape_len);
- buffer += unescape_len;
- p = ++pe;
- } else {
- pe++;
}
}
- if (pe > p) {
- MEMCPY(buffer, p, char, pe - p);
- buffer += pe - p;
}
rb_str_set_len(result, buffer - bufferStart);