Add derived core properties, binary properties, and property aliases.

* tool/enc-unicode.rb, enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: Add DerivedCoreProperties, PropList (Binary Property), PropertyAlias and PropertyValueAlias. Now users of tool/enc-unicode.rb should specify the directory of UCD files. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25324 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2009-10-13 12:27:00 +0000
committer: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2009-10-13 12:27:00 +0000
commit: 181eb7d5c11571c811c6d57bfd99332334b6e1e8 ()
tree: 56f1433ba9100f6ad5cde7310b836a73cba42ad1 /tool/enc-unicode.rb
parent: 391e5df571e8f5bcef3ee0cd227002056cbcbe5c (diff)
1 files changed, 76 insertions, 28 deletions
@@ -3,15 +3,15 @@
 # Creates the data structures needed by Oniguruma to map Unicode codepoints to
 # property names and POSIX character classes
 #
-# To use this, get UnicodeData.txt and Scripts.txt from unicode.org.
 # (http://unicode.org/Public/UNIDATA/)
 # Then run the following command.
-# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
 # This produces the source file for gperf.
 # After this, simply make ruby.
-unless ARGV.size == 2
- $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
 exit(1)
 end
@@ -161,23 +161,49 @@ def parse_unicode_data(file)
 end
-def parse_scripts(file)
- script = nil
 data = []
 names = []
- IO.foreach(file) do |line|
- if /^# Total code points: / =~ line
- make_const(script, pair_codepoints(data), 'Script')
- names << script
- data = []
- elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line
- script = $3
- $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
 end
 end
 names
 end
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
 # the group
@@ -195,17 +221,23 @@ end
 def normalize_propname(name)
 name = name.downcase
- name.gsub!(/[- _]/, '')
 name
 end
 puts '%{'
-gcps, data = parse_unicode_data(ARGV[0])
 POSIX_NAMES.each do |name|
 make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
 end
 print "\n#ifdef USE_UNICODE_PROPERTIES"
-gcps.each do |name|
 category =
 case name.size
 when 1 then 'Major Category'
@@ -214,18 +246,19 @@ gcps.each do |name|
 end
 make_const(name, pair_codepoints(data[name]), category)
 end
-scripts = parse_scripts(ARGV[1])
-puts "#endif /* USE_UNICODE_PROPERTIES */"
-puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
 POSIX_NAMES.each{|name|puts" CR_#{name},"}
 puts "#ifdef USE_UNICODE_PROPERTIES"
-gcps.each{|name|puts" CR_#{name},"}
-scripts.each{|name|puts" CR_#{name},"}
-puts "#endif /* USE_UNICODE_PROPERTIES */"
-puts "};"
 puts(<<'__HEREDOC')
 struct uniname2ctype_struct {
 int name, ctype;
 };
@@ -236,12 +269,27 @@ struct uniname2ctype_struct;
 %%
 __HEREDOC
 i = -1
-POSIX_NAMES.each {|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
 puts "#ifdef USE_UNICODE_PROPERTIES"
-gcps.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
-scripts.each{|name|puts"%-21s %3d" % [normalize_propname(name)+',', i+=1]}
-puts "#endif /* USE_UNICODE_PROPERTIES */\n"
 puts(<<'__HEREDOC')
 %%
 static int
 uniname2ctype(const UChar *name, unsigned int len)
author	naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2009-10-13 12:27:00 +0000
committer	naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2009-10-13 12:27:00 +0000
commit	181eb7d5c11571c811c6d57bfd99332334b6e1e8 ()
tree	56f1433ba9100f6ad5cde7310b836a73cba42ad1 /tool/enc-unicode.rb
parent	391e5df571e8f5bcef3ee0cd227002056cbcbe5c (diff)