diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2016-12-10 17:47:04 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2016-12-10 17:47:04 +0000 |
commit | 2873edeafb6f6df1fc99bb9b1167591b99dd378c () | |
tree | deb58ca3dc6d6cd71c1740e62aa7e47bea5ed37e /enc | |
parent | 42a677c895f82bcd611db2773fbe68b0558b142d (diff) |
Merge Onigmo 6.0.0
* https://github.com/k-takata/Onigmo/blob/Onigmo-6.0.0/HISTORY * fix for ruby 2.4: https://github.com/k-takata/Onigmo/pull/78 * suppress warning: https://github.com/k-takata/Onigmo/pull/79 * include/ruby/oniguruma.h: include onigmo.h. * template/encdb.h.tmpl: ignore duplicated definition of EUC-CN in enc/euc_kr.c. It is defined in enc/gb2313.c with CRuby macro. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@57045 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
42 files changed, 593 insertions, 527 deletions
@@ -29,9 +29,12 @@ */ #include "regenc.h" -#include "encindex.h" #ifndef ENCINDEX_ASCII -#define ENCINDEX_ASCII 0 #endif OnigEncodingDefine(ascii, ASCII) = { @@ -51,9 +54,9 @@ OnigEncodingDefine(ascii, ASCII) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, ENCINDEX_ASCII, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("BINARY", "ASCII-8BIT") ENC_REPLICATE("IBM437", "ASCII-8BIT") @@ -300,9 +300,9 @@ OnigEncodingDefine(big5, BIG5) = { onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, big5_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* @@ -335,9 +335,9 @@ OnigEncodingDefine(big5_hkscs, BIG5_HKSCS) = { onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, big5_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_ALIAS("Big5-HKSCS:2008", "Big5-HKSCS") @@ -370,7 +370,7 @@ OnigEncodingDefine(big5_uao, BIG5_UAO) = { onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, big5_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; @@ -211,9 +211,9 @@ OnigEncodingDefine(cp949, CP949) = { onigenc_not_support_get_ctype_code_range, cp949_left_adjust_char_head, cp949_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: CP949 @@ -27,7 +27,7 @@ * SUCH DAMAGE. */ -#include "regint.h" #define emacsmule_islead(c) ((UChar )(c) < 0x9e) @@ -334,9 +334,9 @@ OnigEncodingDefine(emacs_mule, Emacs_Mule) = { onigenc_not_support_get_ctype_code_range, left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_REPLICATE("stateless-ISO-2022-JP", "Emacs-Mule") @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
*/ -#include "regint.h" #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) @@ -576,9 +576,9 @@ OnigEncodingDefine(euc_jp, EUC_JP) = { get_ctype_code_range, left_adjust_char_head, is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: EUC-JP @@ -188,8 +188,33 @@ OnigEncodingDefine(euc_kr, EUC_KR) = { onigenc_not_support_get_ctype_code_range, euckr_left_adjust_char_head, euckr_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_ALIAS("eucKR", "EUC-KR") @@ -221,8 +221,8 @@ OnigEncodingDefine(euc_tw, EUC_TW) = { onigenc_not_support_get_ctype_code_range, euctw_left_adjust_char_head, euctw_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_ALIAS("eucTW", "EUC-TW") @@ -597,8 +597,7 @@ OnigEncodingDefine(gb18030, GB18030) = { onigenc_not_support_get_ctype_code_range, gb18030_left_adjust_char_head, gb18030_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; - @@ -211,9 +211,9 @@ OnigEncodingDefine(gbk, GBK) = { onigenc_not_support_get_ctype_code_range, gbk_left_adjust_char_head, gbk_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: GBK @@ -256,45 +256,46 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSE } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncISO_8859_1_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code += 0x20; } - else if (code==0xAA || code==0xBA || code==0xB5 || code==0xFF) ; - else if ((EncISO_8859_1_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_1, ISO_8859_1) = { @@ -314,8 +315,8 @@ OnigEncodingDefine(iso_8859_1, ISO_8859_1) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-1", "ISO-8859-1") @@ -215,9 +215,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -225,48 +225,49 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } - else if (code==0xBD || code==0xFF) ; else if ((EncISO_8859_10_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_10_TO_LOWER_CASE(code); } - else if ((EncISO_8859_10_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code>=0xA0 && code<=0xBF) code -= 0x10; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_10, ISO_8859_10) = { @@ -286,8 +287,8 @@ OnigEncodingDefine(iso_8859_10, ISO_8859_10) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-10", "ISO-8859-10") @@ -93,9 +93,9 @@ OnigEncodingDefine(iso_8859_11, ISO_8859_11) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ISO8859-11", "ISO-8859-11") @@ -208,9 +208,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -218,38 +218,39 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; 
OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncISO_8859_13_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_13_TO_LOWER_CASE(code); } - else if (code==0xB5) ; - else if ((EncISO_8859_13_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0xB8 || code==0xBA || code==0xBF) { code -= 0x10; } else { @@ -257,11 +258,11 @@ case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_13, ISO_8859_13) = { @@ -281,8 +282,8 @@ OnigEncodingDefine(iso_8859_13, ISO_8859_13) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-13", "ISO-8859-13") @@ -217,9 +217,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -227,58 +227,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { 
OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } - /* else if (code==0xAA || code==0xBA) ; */ else if ((EncISO_8859_14_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_14_TO_LOWER_CASE(code); } - else if ((EncISO_8859_14_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if(code == 0xA2 || code == 0xA5 || code == 0xB1 || code == 0xB3 || code == 0xB5 || code == 0xBE) code -= 0x1; - else if(code == 0xAB) code -= 0x5; - else if(code == 0xFF) code -= 0x50; - else if(code == 0xB9) code -= 0x2; - else if(code == 0xBF) code -= 0x4; - else if(code == 0xB8 || code == 0xBA || code == 0xBC) code -= 0x10; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_14, ISO_8859_14) = { @@ -298,8 +298,8 @@ OnigEncodingDefine(iso_8859_14, ISO_8859_14) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-14", "ISO-8859-14") @@ -211,9 +211,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ 
-221,54 +221,55 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } - else if (code==0xAA || code==0xBA || code==0xB5) ; else if ((EncISO_8859_15_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_15_TO_LOWER_CASE(code); } - else if ((EncISO_8859_15_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0xA8) code -= 2; - else if (code==0xB8) code -= 4; - else if (code==0xBD) code -= 1; - else if (code==0xFF) code -= 0x41; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_15, ISO_8859_15) = { @@ -288,8 +289,8 @@ OnigEncodingDefine(iso_8859_15, ISO_8859_15) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-15", "ISO-8859-15") @@ -213,9 +213,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return 
onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -223,57 +223,57 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncISO_8859_16_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_16_TO_LOWER_CASE(code); } - else if ((EncISO_8859_16_CtypeTable[code]&BIT_CTYPE_LOWER) && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0xA2 || code==0xBD) code--; - else if (code==0xB3 || code==0xBA || code==0xBF) code -= 0x10; - else if (code==0xA8 || code==0xAE) code -= 0x02; - else if (code==0xB9) code -= 0x07; - else if (code==0xB8) code -= 0x04; - else if (code==0xFF) code -= 0x41; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_16, ISO_8859_16) = { @@ -293,8 +293,8 @@ OnigEncodingDefine(iso_8859_16, ISO_8859_16) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-16", "ISO-8859-16") @@ -221,50 +221,50 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSE } static 
int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncISO_8859_2_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_2_TO_LOWER_CASE(code); } - else if ((EncISO_8859_2_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { - if (code>=0xB1 && code<=0xBF){ flags |= ONIGENC_CASE_MODIFIED; code -= 0x10; } - else{ flags |= ONIGENC_CASE_MODIFIED; code -= 0x20; } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_2, ISO_8859_2) = { @@ -284,8 +284,8 @@ OnigEncodingDefine(iso_8859_2, ISO_8859_2) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-2", "ISO-8859-2") @@ -223,45 +223,46 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, #define DOTLESS_i (0xB9) #define I_WITH_DOT_ABOVE (0xA9) static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if 
(flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } - else if (code==0xB5) ; else if ((EncISO_8859_3_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='I') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? DOTLESS_i : 'i'; else code = ENC_ISO_8859_3_TO_LOWER_CASE(code); } else if ((EncISO_8859_3_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='i') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? I_WITH_DOT_ABOVE : 'I'; - else if (code==DOTLESS_i) code = 'I'; - else if (code>=0xB0 && code<=0xBF ) { code -= 0x10; } else { @@ -269,11 +270,11 @@ case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_3, ISO_8859_3) = { @@ -293,8 +294,8 @@ OnigEncodingDefine(iso_8859_3, ISO_8859_3) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-3", "ISO-8859-3") @@ -232,31 +232,32 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncISO_8859_4_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_4_TO_LOWER_CASE(code); } - else if (code==0xA2) ; else if ((EncISO_8859_4_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code>=0xA0&&code<=0xBF) { - if (code==0xBF) code -= 0x02; else code -= 0x10; @@ -265,11 +266,11 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_4, ISO_8859_4) = { @@ -289,8 +290,8 @@ OnigEncodingDefine(iso_8859_4, ISO_8859_4) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-4", "ISO-8859-4") @@ -210,35 +210,35 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; if ((EncISO_8859_5_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_5_TO_LOWER_CASE(code); } else if ((EncISO_8859_5_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (0xF1<=code && code<=0xFF) code -= 0x50; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* 
switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_5, ISO_8859_5) = { @@ -258,8 +258,8 @@ OnigEncodingDefine(iso_8859_5, ISO_8859_5) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-5", "ISO-8859-5") @@ -93,9 +93,9 @@ OnigEncodingDefine(iso_8859_6, ISO_8859_6) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ISO8859-6", "ISO-8859-6") @@ -206,58 +206,58 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==0xF2) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; code = 0xD3; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; code = 0xF3; } } else if ((EncISO_8859_7_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_ISO_8859_7_TO_LOWER_CASE(code); } - else if (code==0xC0 || code==0xE0) - ; else if ((EncISO_8859_7_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0xDC) { - code-=0x26; } - else if (code>=0xDD && code<=0xDF) { - code-=0x25; } - else if (code==0xFC) { - code-=0x40; } - else if (code==0xFD || code==0xFE) { - code-=0x3F; } else { - code-=0x20; } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch 
from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_7, ISO_8859_7) = { @@ -277,8 +277,8 @@ OnigEncodingDefine(iso_8859_7, ISO_8859_7) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-7", "ISO-8859-7") @@ -93,9 +93,9 @@ OnigEncodingDefine(iso_8859_8, ISO_8859_8) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ISO8859-8", "ISO-8859-8") @@ -204,9 +204,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -216,53 +216,54 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, #define DOTLESS_i (0xFD) #define I_WITH_DOT_ABOVE (0xDD) static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } - else if (code==0xAA || code==0xB5 || code==0xBA || code==0xFF) ; else if ((EncISO_8859_9_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='I') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? DOTLESS_i : 'i'; else code = ENC_ISO_8859_9_TO_LOWER_CASE(code); } else if ((EncISO_8859_9_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='i') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? I_WITH_DOT_ABOVE : 'I'; - else if (code==DOTLESS_i) code = 'I'; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(iso_8859_9, ISO_8859_9) = { @@ -282,8 +283,8 @@ OnigEncodingDefine(iso_8859_9, ISO_8859_9) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-9", "ISO-8859-9") @@ -214,9 +214,8 @@ OnigEncodingDefine(koi8_r, KOI8_R) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("CP878", "KOI8-R") - @@ -218,7 +218,7 @@ OnigEncodingDefine(koi8_u, KOI8_U) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; @@ -2,7 +2,7 @@ mktable.c **********************************************************************/ /*- - * Copyright (c) 2002-2007 K.Kosako <sndgk393 AT ybb 
DOT ne DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,7 +31,10 @@ #include <stdio.h> #include <locale.h> #define __USE_ISOC99 #include <ctype.h> #include "regenc.h" @@ -1108,11 +1111,13 @@ static int exec(FILE* fp, ENC_INFO* einfo) #define NCOL 8 int c, val, enc; enc = einfo->num; - fprintf(fp, "static const unsigned short Enc%s_CtypeTable[256] = {\n", - einfo->name); for (c = 0; c < 256; c++) { val = 0; @@ -1131,20 +1136,33 @@ static int exec(FILE* fp, ENC_INFO* einfo) if (IsWord (enc, c)) val |= BIT_CTYPE_WORD; if (IsAscii (enc, c)) val |= BIT_CTYPE_ASCII; - if (c % NCOL == 0) fputs(" ", fp); - fprintf(fp, "0x%04x", val); - if (c != 255) fputs(",", fp); if (c != 0 && c % NCOL == (NCOL-1)) - fputs("\n", fp); else - fputs(" ", fp); } - fprintf(fp, "};\n"); return 0; } extern int main(int argc ARG_UNUSED, char* argv[] ARG_UNUSED) { int i; FILE* fp = stdout; @@ -1155,7 +1173,11 @@ extern int main(int argc ARG_UNUSED, char* argv[] ARG_UNUSED) /* setlocale(LC_ALL, "fr_FR.iso88591"); */ for (i = 0; i < (int )(sizeof(Info)/sizeof(ENC_INFO)); i++) { - exec(fp, &Info[i]); } return 0; @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
*/ -#include "regint.h" static const int EncLen_SJIS[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -563,9 +563,9 @@ OnigEncodingDefine(shift_jis, Shift_JIS) = { get_ctype_code_range, left_adjust_char_head, is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: Shift_JIS @@ -139,17 +139,17 @@ code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) /* macros related to ONIGENC_CASE flags */ /* defined here because not used in other files */ -#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE|ONIGENC_CASE_IS_TITLECASE|ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL) /* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ #define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ -#define SpecialsLengthExtract(n) ((n)>>SpecialsLengthOffset) -#define SpecialsCodepointExtract(n) ((n)&((1<<SpecialsLengthOffset)-1)) -#define SpecialsLengthEncode(n) ((n)<<SpecialsLengthOffset) -#define OnigSpecialIndexMask (((1<<OnigSpecialIndexWidth)-1)<<OnigSpecialIndexShift) -#define OnigSpecialIndexEncode(n) ((n)<<OnigSpecialIndexShift) -#define OnigSpecialIndexDecode(n) (((n)&OnigSpecialIndexMask)>>OnigSpecialIndexShift) /* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ #define U ONIGENC_CASE_UPCASE @@ -660,128 +660,130 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc) { - OnigCodePoint code; - OnigUChar *to_start = to; - OnigCaseFoldType flags = *flagP; - int codepoint_length; - - to_end -= CASE_MAPPING_SLACK; - /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to - * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ - flags |= (flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE))<<ONIGENC_CASE_SPECIAL_OFFSET; - - while (*pp<end && to<=to_end) { - codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, end); - if 
(codepoint_length < 0) - return codepoint_length; /* encoding invalid */ - code = ONIGENC_MBC_TO_CODE(enc, *pp, end); - *pp += codepoint_length; - - if (code<='z') { /* ASCII comes first */ - if (code>='a' && code<='z') { - if (flags&ONIGENC_CASE_UPCASE) { - MODIFIED; - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='i') - code = I_WITH_DOT_ABOVE; - else - code += 'A'-'a'; - } - } - else if (code>='A' && code<='Z') { - if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) { - MODIFIED; - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='I') - code = DOTLESS_i; - else - code += 'a'-'A'; - } - } } - else if (!(flags&ONIGENC_CASE_ASCII_ONLY) && code>=0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ - const CodePointList3 *folded; - - if (code==I_WITH_DOT_ABOVE) { - if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) { - MODIFIED; - code = 'i'; - if (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ - to += ONIGENC_CODE_TO_MBC(enc, code, to); - code = DOT_ABOVE; - } - } - } - else if (code==DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ - if (flags&ONIGENC_CASE_UPCASE) - MODIFIED, code = 'I'; } - else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ - if ((flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ - && (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ - /* already Titlecase, no changes needed */ - } - else if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ - const OnigCodePoint *next; - int count; - - MODIFIED; - if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_SPECIALS) { /* special */ - const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); - - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ - if ((flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) - == 
(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ - goto SpecialsCopy; - else /* swapCASE not needed */ - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) { /* Titlecase available */ - if (flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ - goto SpecialsCopy; - else /* Titlecase not needed */ - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_DOWN_SPECIAL) { - if (!(flags&ONIGENC_CASE_DOWN_SPECIAL)) - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ - SpecialsCopy: - count = SpecialsLengthExtract(*SpecialsStart); - next = SpecialsStart; - code = SpecialsCodepointExtract(*next++); - } - else { /* no specials */ - count = OnigCodePointCount(folded->n); - next = folded->code; - code = *next++; - } - if (count==1) - ; - else if (count==2) { - to += ONIGENC_CODE_TO_MBC(enc, code, to); - code = *next; - } - else { /* count == 3 */ - to += ONIGENC_CODE_TO_MBC(enc, code, to); - to += ONIGENC_CODE_TO_MBC(enc, *next++, to); - code = *next; - } - } } - else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0 /* data about character found in CaseUnfold_11_Table */ - && flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ - MODIFIED; - code = folded->code[(flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) ? 
1 : 0]; } } - to += ONIGENC_CODE_TO_MBC(enc, code, to); - /* switch from titlecase to lowercase for capitalize */ - if (flags & ONIGENC_CASE_TITLECASE) - flags ^= (ONIGENC_CASE_UPCASE |ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE| - ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL); } - *flagP = flags; - return (int)(to-to_start); } #if 0 @@ -1,7 +1,10 @@ #include "regenc.h" -#include "encindex.h" #ifndef ENCINDEX_US_ASCII -#define ENCINDEX_US_ASCII 0 #endif static int @@ -29,9 +32,9 @@ OnigEncodingDefine(us_ascii, US_ASCII) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, ENCINDEX_US_ASCII, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ASCII", "US-ASCII") ENC_ALIAS("ANSI_X3.4-1968", "US-ASCII") @@ -249,8 +249,8 @@ OnigEncodingDefine(utf_16be, UTF_16BE) = { onigenc_utf16_32_get_ctype_code_range, utf16be_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("UCS-2BE", "UTF-16BE") @@ -242,7 +242,7 @@ OnigEncodingDefine(utf_16le, UTF_16LE) = { onigenc_utf16_32_get_ctype_code_range, utf16le_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; @@ -187,9 +187,8 @@ OnigEncodingDefine(utf_32be, UTF_32BE) = { onigenc_utf16_32_get_ctype_code_range, utf32be_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("UCS-4BE", "UTF-32BE") - @@ -187,8 +187,8 @@ OnigEncodingDefine(utf_32le, UTF_32LE) = { onigenc_utf16_32_get_ctype_code_range, utf32le_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("UCS-4LE", "UTF-32LE") @@ -28,17 +28,20 @@ */ #include "regenc.h" -#include "encindex.h" #ifndef ENCINDEX_UTF_8 -#define ENCINDEX_UTF_8 0 #endif #define 
USE_INVALID_CODE_SCHEME #ifdef USE_INVALID_CODE_SCHEME /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ -#define INVALID_CODE_FE 0xfffffffe -#define INVALID_CODE_FF 0xffffffff #endif #define VALID_CODE_LIMIT 0x0010ffff @@ -428,9 +431,9 @@ OnigEncodingDefine(utf_8, UTF_8) = { get_ctype_code_range, left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, ENCINDEX_UTF_8, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("CP65001", "UTF-8") @@ -444,4 +447,3 @@ ENC_ALIAS("CP65001", "UTF-8") ENC_REPLICATE("UTF8-MAC", "UTF-8") ENC_ALIAS("UTF-8-MAC", "UTF8-MAC") ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */ - @@ -191,40 +191,41 @@ cp1250_get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncCP1250_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_CP1250_TO_LOWER_CASE(code); } - else if (code==0xB5) ; else if ((EncCP1250_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0xB9) code = 0xA5; - else if (code==0xBE) code = 0xBC; else if (code >= 0x8A && code <= 0xBF && code!=0xB9) code -= 0x10; @@ -232,11 +233,11 @@ case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(windows_1250, Windows_1250) = { @@ -256,9 +257,9 @@ OnigEncodingDefine(windows_1250, Windows_1250) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; /* * Name: windows-1250 @@ -181,49 +181,50 @@ cp1251_get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; if ((EncCP1251_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_CP1251_TO_LOWER_CASE(code); } - else if (code==0xB5) ; else if ((EncCP1251_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if ((0x61<=code && code<=0x7A) || (0xE0<=code && code<=0xFF)) 
code -= 0x20; - else if (code==0xA2 || code==0xB3 || code==0xBE) code -= 0x01; - else if (code==0x83) code = 0x81; - else if (code==0xBC) code = 0xA3; - else if (code==0xB4) code = 0xA5; else code -= 0x10; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(windows_1251, Windows_1251) = { onigenc_single_byte_mbc_enc_len, - "Windows-1251", /* name */ 1, /* max enc length */ 1, /* min enc length */ onigenc_is_mbc_newline_0x0a, @@ -238,9 +239,9 @@ OnigEncodingDefine(windows_1251, Windows_1251) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; /* * Name: windows-1251 @@ -190,42 +190,43 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncCP1252_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_CP1252_TO_LOWER_CASE(code); } - else if (code==0x83 || code==0xAA || code==0xBA || code==0xB5) ; else if ((EncCP1252_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0x9A || code==0x9C || code==0x9E) code -= 0x10; - else if (code==0xFF) code -= 0x60; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(windows_1252, Windows_1252) = { @@ -245,9 +246,9 @@ OnigEncodingDefine(windows_1252, Windows_1252) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; /* * Name: windows-1252 @@ -214,62 +214,63 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==0xF2) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; code = 0xD3; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; code = 0xF3; } } - else if (code==0xB5) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; code = 0xCC; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; code = 0xEC; } } - else if (code==0xC0 || code==0xE0 || code==0xB6) ; else if 
((EncCP1253_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; code = ENC_CP1253_TO_LOWER_CASE(code); } else if ((EncCP1253_CtypeTable[code] & BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code==0xDC) code = 0xA2; - else if (code>=0xDD && code<=0xDF) code -= 0x25; - else if (code==0xFC) code = 0xBC; - else if (code==0xFD || code==0xFE) code -= 0x3F; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(windows_1253, Windows_1253) = { @@ -289,8 +290,8 @@ OnigEncodingDefine(windows_1253, Windows_1253) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("CP1253", "Windows-1253") @@ -212,9 +212,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -232,49 +232,50 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 
's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } else if ((EncCP1254_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='I') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? DOTLESS_i : 'i'; else code = ENC_CP1254_TO_LOWER_CASE(code); } - else if (code==0x83 || code==0xAA || code==0xBA || code==0xB5) ; - else if ((EncCP1254_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='i') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? I_WITH_DOT_ABOVE : 'I'; - else if (code==DOTLESS_i) code = 'I'; - else if (code==0x9A || code==0x9C || code==0x9E) code -= 0x10; - else if (code==0xFF) code -= 0x60; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(windows_1254, Windows_1254) = { @@ -294,8 +295,8 @@ OnigEncodingDefine(windows_1254, Windows_1254) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("CP1254", "Windows-1254") @@ -216,9 +216,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -228,55 +228,56 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, #define DOTLESS_i (0xB9) #define I_WITH_DOT_ABOVE (0xA9) static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct 
OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp<end && to<to_end) { code = *(*pp)++; - if (code==SHARP_s) { - if (flags&ONIGENC_CASE_UPCASE) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 'S'; - code = (flags&ONIGENC_CASE_TITLECASE) ? 's' : 'S'; } - else if (flags&ONIGENC_CASE_FOLD) { flags |= ONIGENC_CASE_MODIFIED; *to++ = 's'; code = 's'; } } - else if (code==0xB5) ; else if ((EncCP1252_CtypeTable[code] & BIT_CTYPE_UPPER) - && (flags & (ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='I') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? DOTLESS_i : 'i'; else code = ENC_CP1252_TO_LOWER_CASE(code); } else if ((EncCP1252_CtypeTable[code]&BIT_CTYPE_LOWER) - && (flags&ONIGENC_CASE_UPCASE)) { flags |= ONIGENC_CASE_MODIFIED; - if (code=='i') - code = flags&ONIGENC_CASE_FOLD_TURKISH_AZERI ? I_WITH_DOT_ABOVE : 'I'; - else if (code==DOTLESS_i) code = 'I'; - else if (code>=0xB0 && code<=0xBF ) code -= 0x10; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); } OnigEncodingDefine(windows_1257, Windows_1257) = { @@ -296,9 +297,8 @@ OnigEncodingDefine(windows_1257, Windows_1257) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - case_map, }; - ENC_ALIAS("CP1257", "Windows-1257") @@ -33,7 +33,7 @@ OnigEncodingDefine(windows_31j, Windows_31J) = { mbc_enc_len, - "Windows-31J", /* name */ 2, /* max byte length */ 1, /* min byte length */ onigenc_is_mbc_newline_0x0a, @@ -48,9 +48,9 @@ OnigEncodingDefine(windows_31j, Windows_31J) = { get_ctype_code_range, left_adjust_char_head, is_allowed_reverse_match, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: 
Windows-31J |