diff options
author | ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-01-20 03:56:02 +0000 |
---|---|---|
committer | ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-01-20 03:56:02 +0000 |
commit | fa4bfa6af585589e4465831f1489fee83ce26f09 () | |
tree | fabaa77b102a6a2b93bdb79b70c2fe5dfe21763e /lib | |
parent | f700c1354f19ca5ad73f4e119dcbff493a3e6e00 (diff) |
Merged from REXML main repository:
Fixes ticket:68. NOTE that this involves an API change! Entity declarations in the doctype now generate events that carry two, not one, arguments.

Implements ticket:15, using gwrite's suggestion. This allows Element to be subclassed.

Two unrelated changes, because subversion is retarded and doesn't do block-level commits:
1) Fixed a typo bug in previous change for ticket:15
2) Fixed namespaces handling in XPath and element.

***** Note that this is an API change!!! *****

Element.namespaces() now returns a hash of namespace mappings which are relevant for that node.

Fixes a bug in multiple decodings

The changeset 1230:1231 was bad. The default behavior is *not* to use the native REXML encodings by default, but rather to use ICONV by default. I know that this will piss some people off, but defaulting to the pure Ruby version isn't the correct solution, and it breaks other encodings, so I've reverted it.

* Fixes ticket:61 (xpath_parser)
* Fixes ticket:63 (UTF-16; UNILE decoding was bad)
* Cleans up some tests, removing opportunities for test corruption
* Improves parsing error messages a little
* Adds the ability to override the encoding detection in Source construction
* Fixes an edge case in Functions::string, where document nodes weren't correctly converted
* Fixes Functions::string() for Element and Document nodes
* Fixes some problems in entity handling

Addresses ticket:66
Fixes ticket:71
Addresses ticket:78

NOTE: that this also fixes what is technically another bug in REXML. REXML's XPath parser used to allow exponential notation in numbers. The XPath spec is specific about what a number is, and scientific notation is not included. Therefore, this has been fixed.

Cross-ported a fix for ticket:88 from CVS.
Fixes ticket:80
Documentation cleanup.
Ticket:84
Applied Kou's fix for an un-trac'ed bug.

------------------------------------------------------------------------
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@11548 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | lib/rexml/document.rb | 5 | ||||
-rw-r--r-- | lib/rexml/element.rb | 25 | ||||
-rw-r--r-- | lib/rexml/encoding.rb | 26 | ||||
-rw-r--r-- | lib/rexml/encodings/UNILE.rb | 2 | ||||
-rw-r--r-- | lib/rexml/functions.rb | 25 | ||||
-rw-r--r-- | lib/rexml/node.rb | 6 | ||||
-rw-r--r-- | lib/rexml/parsers/baseparser.rb | 4 | ||||
-rw-r--r-- | lib/rexml/parsers/sax2parser.rb | 4 | ||||
-rw-r--r-- | lib/rexml/parsers/treeparser.rb | 3 | ||||
-rw-r--r-- | lib/rexml/sax2listener.rb | 2 | ||||
-rw-r--r-- | lib/rexml/source.rb | 23 | ||||
-rw-r--r-- | lib/rexml/text.rb | 47 | ||||
-rw-r--r-- | lib/rexml/xpath_parser.rb | 53 |
13 files changed, 142 insertions, 83 deletions
@@ -157,8 +157,9 @@ module REXML # document will be written. # indent:: # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. Defaults to -1 # transitive:: # If transitive is true and indent is >= 0, then the output will be # pretty-printed in such a way that the added whitespace does not affect @@ -94,7 +94,7 @@ module REXML # new_a = d.root.clone # puts new_a # => "<a/>" def clone - Element.new self end # Evaluates to the root node of the document that this element @@ -200,9 +200,9 @@ module REXML end def namespaces - namespaces = [] namespaces = parent.namespaces if parent - namespaces |= attributes.namespaces return namespaces end @@ -494,13 +494,12 @@ module REXML # doc.root.add_element 'c' #-> '<a><b/>Elliott<c/></a>' # doc.root.text = 'Russell' #-> '<a><b/>Russell<c/></a>' # doc.root.text = nil #-> '<a><b/><c/></a>' - def text=( text ) if text.kind_of? String text = Text.new( text, whitespace(), nil, raw() ) elsif text and !text.kind_of? Text text = Text.new( text.to_s, whitespace(), nil, raw() ) end - old_text = get_text if text.nil? old_text.remove unless old_text.nil? 
@@ -557,13 +556,9 @@ module REXML ################################################# def attribute( name, namespace=nil ) - prefix = '' - if namespace - prefix = attributes.prefixes.each { |prefix| - return "#{prefix}:" if namespace( prefix ) == namespace - } || '' - end - attributes.get_attribute( "#{prefix}#{name}" ) end # Evaluates to +true+ if this element has any attributes set, false @@ -1172,16 +1167,16 @@ module REXML end def namespaces - namespaces = [] each_attribute do |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end if @element.document and @element.document.doctype expn = @element.expanded_name expn = @element.document.doctype.name if expn.size == 0 @element.document.doctype.attributes_of(expn).each { |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } end namespaces @@ -24,20 +24,20 @@ module REXML old_verbosity = $VERBOSE begin $VERBOSE = false - enc = enc.nil? ? nil : enc.upcase return false if defined? @encoding and enc == @encoding if enc and enc != UTF_8 - @encoding = enc - raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ - @encoding.untaint - enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) - begin - require enc_file - Encoding.apply(self, @encoding) rescue LoadError, Exception - begin - require 'rexml/encodings/ICONV.rb' - Encoding.apply(self, "ICONV") rescue LoadError => err puts err.message raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." 
@@ -51,14 +51,14 @@ module REXML ensure $VERBOSE = old_verbosity end - true end def check_encoding str # We have to recognize UTF-16, LSB UTF-16, and UTF-8 return UTF_16 if /\A\xfe\xff/n =~ str return UNILE if /\A\xff\xfe/n =~ str - str =~ /^\s*<?xml\s*version\s*=\s*(['"]).*?\2\s*encoding\s*=\s*(["'])(.*?)\2/um return $1.upcase if $1 return UTF_8 end @@ -18,7 +18,7 @@ module REXML def decode_unile(str) array_enc=str.unpack('C*') array_utf8 = [] - 2.step(array_enc.size-1, 2){|i| array_utf8 << (array_enc.at(i) + array_enc.at(i+1)*0x100) } array_utf8.pack('U*') @@ -117,16 +117,30 @@ module REXML elsif defined? object.node_type if object.node_type == :attribute object.value - elsif object.node_type == :element - object.text else object.to_s end else object.to_s end end # UNTESTED def Functions::concat( *objects ) objects.join @@ -139,7 +153,7 @@ module REXML # Fixed by Mike Stok def Functions::contains( string, test ) - string(string).include? string(test) end # Kouhei fixed this @@ -325,8 +339,9 @@ module REXML object.to_f else str = string( object ) - #puts "STRING OF #{object.inspect} = #{str}" - if str =~ /^-?\.?\d/ str.to_f else (0.0 / 0.0) @@ -55,10 +55,8 @@ module REXML return nil end - # Returns the index that +self+ has in its parent's elements array, so that - # the following equation holds true: - # - # node == node.parent.elements[node.index_in_parent] def index_in_parent parent.index(self)+1 end @@ -146,8 +146,6 @@ module REXML # Returns true if there are no more events def empty? - #STDERR.puts "@source.empty? = #{@source.empty?}" - #STDERR.puts "@stack.empty? = #{@stack.empty?}" return (@source.empty? and @stack.empty?) 
end @@ -365,8 +363,6 @@ module REXML else md = @source.match( TEXT_PATTERN, true ) if md[0].length == 0 - puts "EMPTY = #{empty?}" - puts "BUFFER = \"#{@source.buffer}\"" @source.match( /(\s+)/, true ) end #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 @@ -16,6 +16,10 @@ module REXML @tag_stack = [] @entities = {} end def add_listener( listener ) @parser.add_listener( listener ) @@ -23,7 +23,8 @@ module REXML case event[0] when :end_document unless tag_stack.empty? - raise ParseException.new("No close tag for #{tag_stack.inspect}") end return when :start_element @@ -70,7 +70,7 @@ module REXML # ["open-hatch", "PUBLIC", "\"-//Textuality//TEXT Standard open-hatch boilerplate//EN\"", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] # <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif> # ["hatch-pic", "SYSTEM", "\"../grafix/OpenHatch.gif\"", "\n\t\t\t\t\t\t\tNDATA gif", "gif"] - def entitydecl content end # <!NOTATION ...> def notationdecl content @@ -6,7 +6,7 @@ module REXML # Generates a Source object # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given - def SourceFactory::create_from arg#, slurp=true if arg.kind_of? String Source.new(arg) elsif arg.respond_to? :read and @@ -35,12 +35,19 @@ module REXML # Constructor # @param arg must be a String, and should be a valid XML document - def initialize(arg) @orig = @buffer = arg - self.encoding = check_encoding( @buffer ) @line = 0 end # Inherited from Encoding # Overridden to support optimized en/decoding def encoding=(enc) @@ -124,7 +131,7 @@ module REXML #attr_reader :block_size # block_size has been deprecated - def initialize(arg, block_size=500) @er_source = @source = arg @to_utf = false # Determining the encoding is a deceptively difficult issue to resolve. @@ -134,10 +141,12 @@ module REXML # if there is one. If there isn't one, the file MUST be UTF-8, as per # the XML spec. If there is one, we can determine the encoding from # it. 
str = @source.read( 2 ) - if /\A(?:\xfe\xff|\xff\xfe)/n =~ str self.encoding = check_encoding( str ) - @line_break = encode( '>' ) else @line_break = '>' end @@ -159,6 +168,8 @@ module REXML str = @source.readline(@line_break) str = decode(str) if @to_utf and str @buffer << str rescue @source = nil end @@ -42,6 +42,7 @@ module REXML # Use this field if you have entities defined for some text, and you don't # want REXML to escape that text in output. # Text.new( "<&", false, nil, false ) #-> "<&" # Text.new( "<&", false, nil, true ) #-> Parse exception # Text.new( "<&", false, nil, true ) #-> "<&" # # Assume that the entity "s" is defined to be "sean" @@ -172,17 +173,6 @@ module REXML end @unnormalized = Text::unnormalize( @string, doctype ) end - - def wrap(string, width, addnewline=false) - # Recursivly wrap string at width. - return string if string.length <= width - place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) - else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) - end - end # Sets the contents of this text node. This expects the text to be # unnormalized. It returns self. @@ -198,17 +188,28 @@ module REXML @raw = false end - def indent_text(string, level=1, style="\t", indentfirstline=true) - return string if level < 0 - new_string = '' - string.each { |line| - indent_string = style * level - new_line = (indent_string + line).sub(/[\s]+$/,'') - new_string << new_line - } - new_string.strip! 
unless indentfirstline - return new_string end def write( writer, indent=-1, transitive=false, ie_hack=false ) s = to_s() @@ -286,9 +287,10 @@ module REXML def Text::normalize( input, doctype=nil, entity_filter=nil ) copy = input # Doing it like this rather than in a loop improves the speed if doctype # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) doctype.entities.each_value do |entity| copy = copy.gsub( entity.value, "&#{entity.name};" ) if entity.value and @@ -296,7 +298,6 @@ module REXML end else # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) DocType::DEFAULT_ENTITIES.each_value do |entity| copy = copy.gsub(entity.value, "&#{entity.name};" ) end @@ -160,8 +160,13 @@ module REXML node_types = ELEMENTS return nodeset if path_stack.length == 0 || nodeset.length == 0 while path_stack.length > 0 #puts "Path stack = #{path_stack.inspect}" #puts "Nodeset is #{nodeset.inspect}" case (op = path_stack.shift) when :document nodeset = [ nodeset[0].root_node ] @@ -235,9 +240,11 @@ module REXML name = path_stack.shift for element in nodeset if element.node_type == :element - #puts element.name - attr = element.attribute( name, get_namespace(element, prefix) ) - new_nodeset << attr if attr end end when :any @@ -299,8 +306,10 @@ module REXML #puts "Adding node #{node.inspect}" if result == (index+1) new_nodeset << node if result == (index+1) elsif result.instance_of? 
Array - #puts "Adding node #{node.inspect}" if result.size > 0 - new_nodeset << node if result.size > 0 else #puts "Adding node #{node.inspect}" if result new_nodeset << node if result @@ -381,9 +390,25 @@ module REXML node_types = ELEMENTS when :namespace - new_set = [] for node in nodeset - new_nodeset << node.namespace if node.node_type == :element or node.node_type == :attribute end nodeset = new_nodeset @@ -404,6 +429,18 @@ module REXML #puts "RES => #{res.inspect}" return res when :div left = Functions::number(expr(path_stack.shift, nodeset, context)).to_f right = Functions::number(expr(path_stack.shift, nodeset, context)).to_f @@ -477,7 +514,7 @@ module REXML # The next two methods are BAD MOJO! # This is my achilles heel. If anybody thinks of a better # way of doing this, be my guest. This really sucks, but - # it took me three days to get it to work at all. # ######################################################## def descendant_or_self( path_stack, nodeset ) |