diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2009-10-13 12:27:00 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2009-10-13 12:27:00 +0000 |
commit | 181eb7d5c11571c811c6d57bfd99332334b6e1e8 () | |
tree | 56f1433ba9100f6ad5cde7310b836a73cba42ad1 /tool/enc-unicode.rb | |
parent | 391e5df571e8f5bcef3ee0cd227002056cbcbe5c (diff) |
Add derived core properties, binary properties, and property aliases.
* tool/enc-unicode.rb, enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: Add DerivedCoreProperties, PropList (Binary Property), PropertyAlias and PropertyValueAlias. Now users of tool/enc-unicode.rb should specify the directory of UCD files. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rwxr-xr-x | tool/enc-unicode.rb | 104 |
1 file changed, 76 insertions, 28 deletions
@@ -3,15 +3,15 @@ # Creates the data structures needed by Onigurma to map Unicode codepoints to # property names and POSIX character classes # -# To use this, get UnicodeData.txt and Scripts.txt from unicode.org. # (http://unicode.org/Public/UNIDATA/) # And run following command. -# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd # You can get source file for gperf. # After this, simply make ruby. -unless ARGV.size == 2 - $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt" exit(1) end @@ -161,23 +161,49 @@ def parse_unicode_data(file) end -def parse_scripts(file) - script = nil data = [] names = [] - IO.foreach(file) do |line| - if /^# Total code points: / =~ line - make_const(script, pair_codepoints(data), 'Script') - names << script - data = [] - elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line - script = $3 - $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16)) end end names end # make_const(property, pairs, name): Prints a 'static const' structure for a # given property, group of paired codepoints, and a human-friendly name for # the group @@ -195,17 +221,23 @@ end def normalize_propname(name) name = name.downcase - name.gsub!(/[- _]/, '') name end puts '%{' -gcps, data = parse_unicode_data(ARGV[0]) POSIX_NAMES.each do |name| make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]") end print "\n#ifdef USE_UNICODE_PROPERTIES" -gcps.each do |name| category = case name.size when 1 then 'Major Category' @@ -214,18 +246,19 @@ gcps.each do |name| end make_const(name, pair_codepoints(data[name]), category) end -scripts = parse_scripts(ARGV[1]) -puts "#endif /* USE_UNICODE_PROPERTIES */" -puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {" POSIX_NAMES.each{|name|puts" CR_#{name},"} puts "#ifdef USE_UNICODE_PROPERTIES" -gcps.each{|name|puts" CR_#{name},"} -scripts.each{|name|puts" CR_#{name},"} -puts "#endif /* USE_UNICODE_PROPERTIES */" -puts "};" puts(<<'__HEREDOC') struct 
uniname2ctype_struct { int name, ctype; }; @@ -236,12 +269,27 @@ struct uniname2ctype_struct; %% __HEREDOC i = -1 -POSIX_NAMES.each {|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]} puts "#ifdef USE_UNICODE_PROPERTIES" -gcps.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]} -scripts.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]} -puts "#endif /* USE_UNICODE_PROPERTIES */\n" puts(<<'__HEREDOC') %% static int uniname2ctype(const UChar *name, unsigned int len) |