summary | refs | log | tree | commit | diff
path: root/regparse.c
diff options
context:
space:
mode:
author: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-12-05 08:10:24 +0000
committer: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-12-05 08:10:24 +0000
commit: 66a6073859ac6ae2143a9d72162efedece7e1348
tree: 748caa7361d513e33ea1530958a1c08894be969e /regparse.c
parent: 78be4478d176f9dc6f2e4fd8a5daa90173458568 (diff)
update to Unicode 11.0.0 (main step, not complete yet)
- common.mk: Change Unicode version to 11.0.0, and Emoji version to 11.0
- test/ruby/enc/test_emoji_breaks.rb: update hard-coded Emoji version
- enc/unicode/11.0.0, enc/unicode/11.0.0/casefold.h, enc/unicode/name2ctype.h: Add generated files. Files for Unicode 10.0.0 will be removed once we are sure 11.0.0 works.
- lib/unicode_normalize/tables.rb: Updated table.
- regparse.c: Almost completely reimplement grapheme cluster detection in function node_extended_grapheme_cluster().

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@66213 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- regparse.c 483
1 files changed, 174 insertions, 309 deletions
@@ -5831,6 +5831,7 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
Node* list2 = NULL;
Node* alt = NULL;
Node* alt2 = NULL;
BBuf *pbuf1 = NULL;
int r = 0;
int num1;
@@ -5845,9 +5846,9 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
if (ONIGENC_IS_UNICODE(env->enc)) {
/* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
CClassNode* cc;
- OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 0x00 : 0x80;
- Node **seq = node_array; /* seq[5] */
- Node **alts = node_array+5; /* alts[4] */
for (i=0; i<NODE_ARRAY_SIZE; i++)
node_array[i] = NULL_NODE;
@@ -5857,320 +5858,183 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
* order the various expressions appear in the grammar)
* in the old-style parts. It is forwards in the new-style
* parts (in blocks ending with create_sequence_node()). */
- /* Unicode 10.0.0 */
- /* CRLF
- * | Prepend*
- * ( RI-sequence | Hangul-Syllable | !Control )
- * ( Grapheme_Extend | SpacingMark )*
- * | . */
-
- /* Unicode 10.0.0 */
- /* ( Grapheme_Extend | SpacingMark )* */
- R_ERR(create_property_node(&np1, env, "Grapheme_Cluster_Break=Extend"));
-
- cc = NCCLASS(np1);
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=SpacingMark", 0, env));
- R_ERR(add_code_range(&(cc->mbuf), env, 0x200D, 0x200D));
-
- R_ERR(quantify_node(&np1, 0, REPEAT_INFINITE));
-
- tmp = node_new_list(np1, NULL_NODE);
- if (IS_NULL(tmp)) goto err;
- list = tmp;
- np1 = NULL;
-
- /* Unicode 10.0.0 */
- /* ( RI-sequence | Hangul-Syllable | !Control ) */
- /* !Control */
- np1 = node_new_cclass();
- if (IS_NULL(np1)) goto err;
- cc = NCCLASS(np1);
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Control", 1, env));
- if (! (ONIGENC_MBC_MINLEN(env->enc) > 1)) {
- BITSET_CLEAR_BIT(cc->bs, 0x0a);
- BITSET_CLEAR_BIT(cc->bs, 0x0d);
- }
-
- tmp = onig_node_new_alt(np1, NULL_NODE);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- np1 = NULL;
-
- /* Unicode 10.0.0 */
- /* Hangul-Syllable
- * := L* V+ T*
- * | L* LV V* T*
- * | L* LVT T*
- * | L+
- * | T+ */
- /* Unicode 11.0.0 */
- /* Hangul-Syllable
- * := L* (V+ | LV V* | LVT) T*
- * | L+
- * | T+ */
- /* these are equivalent, so we leave things as is for the moment */
-
- /* T+ */
- R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=T", '+'));
-
- tmp = onig_node_new_alt(np1, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- np1 = NULL;
-
- /* L+ */
- R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=L", '+'));
-
- tmp = onig_node_new_alt(np1, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- np1 = NULL;
-
- /* L* LVT T* */
- {
- R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
- R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LVT"));
- R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*'));
-
- R_ERR(create_sequence_node(&list2, seq));
- }
-
- tmp = onig_node_new_alt(list2, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list2 = NULL;
-
- /* L* LV V* T* */
- {
- R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
- R_ERR(create_property_node(seq+1, env, "Grapheme_Cluster_Break=LV"));
- R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=V", '*'));
- R_ERR(quantify_property_node(seq+3, env, "Grapheme_Cluster_Break=T", '*'));
-
- R_ERR(create_sequence_node(&list2, seq));
- }
-
- tmp = onig_node_new_alt(list2, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list2 = NULL;
-
- /* L* V+ T* */
- {
- R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=L", '*'));
- R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=V", '+'));
- R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=T", '*'));
-
- R_ERR(create_sequence_node(&list2, seq));
- }
-
- tmp = onig_node_new_alt(list2, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list2 = NULL;
- /* end of Hangul-Syllable */
-
- /* Unicode 10.0.0 */
- /* Emoji sequence := (E_Base | EBG) Extend* E_Modifier?
- * (ZWJ (Glue_After_Zwj | EBG Extend* E_Modifier?) )* */
- /* ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?) */
{
- /* Unicode 10.0.0 */
- /* Emoji variation sequence
- * http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt
- */
- /* Emoji U+FE0F */
- {
- seq[0] = node_new_cclass();
- if (IS_NULL(seq[0])) goto err;
- cc = NCCLASS(seq[0]);
- R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_Emoji));
- r = ONIGENC_CODE_TO_MBC(env->enc, 0xfe0f, buf); /* VARIATION SELECTOR-16 */
- if (r < 0) goto err;
- seq[1] = node_new_str_raw(buf, buf + r);
- if (IS_NULL(seq[1])) goto err;
- R_ERR(quantify_node(seq+1, 0, 1));
-
- R_ERR(create_sequence_node(alts+0, seq));
}
- /* Unicode 10.0.0 */
- /* Glue_After_Zwj */
{
- seq[0] = node_new_cclass();
- if (IS_NULL(seq[0])) goto err;
- cc = NCCLASS(seq[0]);
- R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_GAZ));
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env));
- R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*'));
R_ERR(create_sequence_node(alts+1, seq));
}
- /* E_Base_GAZ Extend* E_Modifier? */
- {
- R_ERR(create_property_node(seq+0, env, "Grapheme_Cluster_Break=E_Base_GAZ"));
- R_ERR(quantify_property_node(seq+1, env, "Grapheme_Cluster_Break=Extend", '*'));
- R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
-
- R_ERR(create_sequence_node(alts+2, seq));
- }
-
- R_ERR(create_alternate_node(&alt2, alts));
- }
-
- tmp = node_new_list(alt2, NULL_NODE);
- if (IS_NULL(tmp)) goto err;
- list2 = tmp;
- alt2 = NULL;
-
- /* ZWJ */
- r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */
- if (r < 0) goto err;
- np1 = node_new_str_raw(buf, buf + r);
- if (IS_NULL(np1)) goto err;
-
- tmp = node_new_list(np1, list2);
- if (IS_NULL(tmp)) goto err;
- list2 = tmp;
- np1 = NULL;
-
- R_ERR(quantify_node(&list2, 0, REPEAT_INFINITE));
- np1 = list2;
- list2 = NULL;
-
- tmp = node_new_list(np1, NULL_NODE);
- if (IS_NULL(tmp)) goto err;
- list2 = tmp;
- np1 = NULL;
-
- /* E_Modifier? */
- R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
-
- tmp = node_new_list(np1, list2);
- if (IS_NULL(tmp)) goto err;
- list2 = tmp;
- np1 = NULL;
-
- /* Extend* */
- R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Extend", '*'));
-
- tmp = node_new_list(np1, list2);
- if (IS_NULL(tmp)) goto err;
- list2 = tmp;
- np1 = NULL;
-
- /* (E_Base | EBG) */
- np1 = node_new_cclass();
- if (IS_NULL(np1)) goto err;
- cc = NCCLASS(np1);
- R_ERR(add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, onigenc_unicode_GCB_ranges_E_Base));
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base", 0, env));
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env));
-
- tmp = node_new_list(np1, list2);
- if (IS_NULL(tmp)) goto err;
- list2 = tmp;
- np1 = NULL;
-
- tmp = onig_node_new_alt(list2, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list2 = NULL;
-
- /* Unicode 10.0.0 */
- /* a sequence starting with ZWJ seems artificial, but GraphemeBreakTest
- * has such examples.
- * http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakTest.html
- */
- /* ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
- {
- r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* ZERO WIDTH JOINER (ZWJ) */
- if (r < 0) goto err;
- seq[0] = node_new_str_raw(buf, buf + r);
- if (IS_NULL(seq[0])) goto err;
-
- seq[1] = node_new_cclass();
- if (IS_NULL(seq[1])) goto err;
- cc = NCCLASS(seq[1]);
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=Glue_After_Zwj", 0, env));
- R_ERR(add_property_to_cc(cc, "Grapheme_Cluster_Break=E_Base_GAZ", 0, env));
-
- R_ERR(quantify_property_node(seq+2, env, "Grapheme_Cluster_Break=E_Modifier", '?'));
-
- R_ERR(create_sequence_node(&list2, seq));
- } /* End of ZWJ (E_Base_GAZ | Glue_After_Zwj) E_Modifier? */
-
- tmp = onig_node_new_alt(list2, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list2 = NULL;
-
- /* Unicode 10.0.0/11.0.0 */
- /* this is Regional_Indicator+ in the Unicode 10.0.0 regular expression,
- * but the segmentation rules and Unicode 11.0.0 use Regional_Indicator{2}, so no need to fix */
- /* RI-Sequence := Regional_Indicator{2} */
- R_ERR(quantify_property_node(&np1, env, "Regional_Indicator", '2'));
-
- tmp = onig_node_new_alt(np1, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- np1 = NULL;
- tmp = node_new_list(alt, list);
- if (IS_NULL(tmp)) goto err;
- list = tmp;
- alt = NULL;
-
- /* Prepend* */
- R_ERR(quantify_property_node(&np1, env, "Grapheme_Cluster_Break=Prepend", '*'));
-
- tmp = node_new_list(np1, list);
- if (IS_NULL(tmp)) goto err;
- list = tmp;
- np1 = NULL;
-
- /* PerlSyntax: (?s:.), RubySyntax: (?m:.) */
- np1 = node_new_anychar();
- if (IS_NULL(np1)) goto err;
-
- option = env->option;
- ONOFF(option, ONIG_OPTION_MULTILINE, 0);
- tmp = node_new_option(option);
- if (IS_NULL(tmp)) goto err;
- NENCLOSE(tmp)->target = np1;
- np1 = tmp;
-
- tmp = onig_node_new_alt(np1, NULL_NODE);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- np1 = NULL;
-
- /* Prepend+ ZWJ* */
- {
- R_ERR(quantify_property_node(seq+0, env, "Grapheme_Cluster_Break=Prepend", '+'));
- r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); /* does this belong to Prepend?? */
- if (r < 0) goto err;
- seq[1] = node_new_str_raw(buf, buf + r);
- if (IS_NULL(seq[1])) goto err;
- R_ERR(quantify_node(seq+1, 0, 1));
-
- R_ERR(create_sequence_node(&list2, seq));
- }
-
- tmp = onig_node_new_alt(list2, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list2 = NULL;
-
- tmp = onig_node_new_alt(list, alt);
- if (IS_NULL(tmp)) goto err;
- alt = tmp;
- list = NULL;
}
else
#endif /* USE_UNICODE_PROPERTIES */
@@ -6186,11 +6050,12 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
NENCLOSE(tmp)->target = np1;
np1 = tmp;
- alt = onig_node_new_alt(np1, NULL_NODE);
- if (IS_NULL(alt)) goto err;
np1 = NULL;
}
/* \x0D\x0A */
r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf);
if (r < 0) goto err;
@@ -6200,15 +6065,15 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env)
np1 = node_new_str_raw(buf, buf + num1 + r);
if (IS_NULL(np1)) goto err;
- tmp = onig_node_new_alt(np1, alt);
if (IS_NULL(tmp)) goto err;
- alt = tmp;
np1 = NULL;
/* (?>\x0D\x0A|...) */
tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK);
if (IS_NULL(tmp)) goto err;
- NENCLOSE(tmp)->target = alt;
np1 = tmp;
#ifdef USE_UNICODE_PROPERTIES