Ticket #95: libxml_VS_Hpricot_VS_REXML_from_xml.rb

File libxml_VS_Hpricot_VS_REXML_from_xml.rb, 12.1 kB (added by chr..@octopod.info, 1 year ago)

The previous benchmark with a slightly modified version of the libxml sax code that uses REXML fast stream parsing

Line 
1 require 'rubygems'
2 gem 'libxml-ruby', '<=0.3.8.4'
3 require 'xml/libxml'
4 require 'benchmark'
5 require 'hpricot'
6 require 'active_record'
7 require 'open-uri'
8 require 'rexml/parsers/streamparser'
9 require 'rexml/parsers/baseparser'
10 require 'rexml/light/node'
11
12
# Feed used as benchmark input. The alternatives are kept for reference.
# xml_feed = "http://feeds.feedburner.com/37signals/beMH" # Hpricot is faster for this feed
# xml_feed = "http://feeds.gawker.com/gizmodo/excerpts.xml"
# xml_feed = "http://brainspl.at/xml/rss20/feed.xml"
xml_feed = "http://jobs.37signals.com/categories/2/jobs;rss"

# Download the feed once so that every parser is timed on the same String.
# URI#read (added to HTTP/FTP URIs by open-uri) returns the body and releases
# the underlying handle itself; it also avoids Kernel#open, which no longer
# accepts URLs on current Ruby versions.
@xml = URI.parse( xml_feed ).read
19
# Lightweight element node used while converting XML into a Hash.
#
# A node holds the element name and attribute Hash (dashes in the name and
# in attribute keys are translated to underscores) plus its children, which
# are either Strings (text content) or nested nodes.
class REXMLUtilityNode
  attr_accessor :name, :attributes, :children

  # name::       element name; "-" is translated to "_"
  # attributes:: Hash of attribute name => value; the Hash passed in is
  #              left untouched
  def initialize(name, attributes = {})
    @name       = name.tr("-", "_")
    @attributes = undasherize_keys(attributes)
    @children   = []
    @text       = false
  end

  # Appends a child (String or node). Once any String child has been added
  # the node is treated as text / mixed content by #to_hash.
  def add_node(node)
    @text = true if node.is_a? String
    @children << node
  end

  # Returns { name => content }.
  #
  # * Text or mixed content: content is the inner markup with XML entities
  #   translated, typecast according to the "type" attribute.
  # * Element-only content: content is a Hash of the child hashes merged
  #   with the attributes; child names that repeat are collected in an Array.
  def to_hash
    if @text
      return { name => typecast_value( translate_xml_entities( inner_html ) ) }
    end

    hash = {}
    # Group children by element name to find the repeating groups.
    @children.group_by { |child| child.name }.each do |key, values|
      if values.size == 1
        hash.merge!( values.first.to_hash )
      else
        hash.merge!( key => values.map { |element| element.to_hash[key] } )
      end
    end

    # Attributes are merged last, so they win over a child of the same name.
    hash.merge!( attributes ) unless attributes.empty?
    { name => hash }
  end

  # String form of a node is its markup; Array#join relies on this when
  # #inner_html flattens mixed content.
  def to_s
    to_html
  end

  # Casts +value+ according to the node's "type" attribute
  # ("integer", "boolean", "datetime" or "date"); any other or missing type
  # returns +value+ unchanged.
  def typecast_value(value)
    case attributes["type"]
      when "integer"  then value.to_i
      when "boolean"  then value.strip == "true"
      when "datetime" then ::Time.parse(value).utc
      when "date"     then ::Date.parse(value)
      else                 value
    end
  end

  # Replaces the five predefined XML entities with their characters.
  # "&amp;" is handled last so that "&amp;lt;" becomes "&lt;" and not "<".
  def translate_xml_entities(value)
    value.gsub(/&lt;/,   "<").
          gsub(/&gt;/,   ">").
          gsub(/&quot;/, '"').
          gsub(/&apos;/, "'").
          gsub(/&amp;/,  "&")
  end

  # Returns a new Hash whose keys have "-" translated to "_".
  # A nil argument is treated as an empty Hash.
  def undasherize_keys(params)
    (params || {}).inject({}) do |result, (key, value)|
      result[key.tr("-", "_")] = value
      result
    end
  end

  # Concatenated children: text as is, nested nodes through #to_s.
  def inner_html
    @children.join
  end

  # Markup for this node, e.g. <em class='x'>text</em>.
  def to_html
    "<#{name}#{attributes_to_xml}>#{inner_html}</#{name}>"
  end

  # Attribute list for the opening tag. Every attribute carries a leading
  # space so it is separated from the tag name and from its neighbours.
  def attributes_to_xml
    attributes.map { |key, value| " #{key}='#{value}'" }.join
  end
end
104
# Converts an XML document into a Hash by pulling events from REXML's
# BaseParser and assembling REXMLUtilityNode objects on a stack.
class ToHashParser
  # xml:: XML source handed to REXML::Parsers::BaseParser
  #
  # Returns the Hash produced by the node left on the stack when the
  # document ends.
  def self.from_xml(xml)
    open_nodes = []
    puller = REXML::Parsers::BaseParser.new(xml)

    loop do
      event = puller.pull
      kind = event[0]
      break if kind == :end_document

      case kind
      when :start_element
        open_nodes.push REXMLUtilityNode.new(event[1], event[2])
      when :end_element
        # The outermost element stays on the stack; it is the result.
        next unless open_nodes.size > 1
        finished = open_nodes.pop
        open_nodes.last.add_node(finished)
      when :text
        # Whitespace-only text is dropped.
        open_nodes.last.add_node(event[1]) unless event[1].strip.length == 0
      end
      # Doctype events and every other event type are ignored.
    end

    open_nodes.pop.to_hash
  end
end
131
132
# Element node assembled from XML::SaxParser callbacks.
#
# The class keeps one shared parser and one shared stack of open elements.
# The callbacks registered in the class body push, fill and pop that stack;
# hash_from_xml runs the parse and converts the remaining node to a Hash.
class XMLUtilityNode #:nodoc:

  # Shared SAX parser, created on first use.
  def self.parser
    @@parser ||= XML::SaxParser.new
  end

  # Shared stack of the elements that are currently open.
  def self.stack
    @@stack ||= []
  end

  parser.on_start_element { |e, a| stack.push( XMLUtilityNode.new( e, a ) ) }

  # Whitespace-only character data is dropped.
  parser.on_characters { |t| stack.last.add_node( t ) unless t.strip.length == 0 }

  # A closed element is attached to its parent; the outermost element stays
  # on the stack so hash_from_xml can pick it up.
  parser.on_end_element do
    if stack.size > 1
      tmp = stack.pop
      stack.last.add_node( tmp )
    end
  end

  # Parses +xml+ and returns the Hash for the node left on the stack.
  def self.hash_from_xml( xml )
    # The stack is shared between calls: drop anything a previous parse
    # left behind (e.g. after it raised) so it cannot leak into this result.
    stack.clear
    parser.string = xml
    parser.parse
    stack.pop.to_hash
  end

  attr_accessor :name, :attrs

  # _name::  element name; "-" is translated to "_"
  # _attrs:: Hash of attribute name => value, or nil; not modified
  def initialize( _name, _attrs )
    @name = _name.tr("-", "_")
    @attrs = undasherize_keys( _attrs )
    @children = []
    @text = false
  end

  # Attribute Hash; never nil, even if the writer assigned nil.
  def attrs
    @attrs ||= {}
  end

  # Appends a child (String or node). Once any String child has been added
  # the node is treated as text / mixed content by #to_hash.
  def add_node( attr )
    @text = true if attr.kind_of?( String )
    @children << attr
  end

  # True once a String child has been added.
  def text?
    @text
  end

  # Returns { name => content }.
  #
  # * Text or mixed content: content is the inner markup with XML entities
  #   translated, typecast according to the "type" attribute.
  # * Element-only content: content is a Hash of the child hashes merged
  #   with the attributes; child names that repeat are collected in an Array.
  def to_hash
    if @text
      return { name => typecast_value( translate_xml_entities( inner_html ) ) }
    end

    out = {}
    # Group children by element name to find the repeating groups.
    @children.group_by { |c| c.name }.each do |k, v|
      if v.size == 1
        out.merge!( v.first.to_hash )
      else
        out.merge!( k => v.map { |elem| elem.to_hash[k] } )
      end
    end

    # Attributes are merged last, so they win over a child of the same name.
    out.merge!( attrs ) unless attrs.empty?
    { name => out }
  end

  # String form of a node is its markup; Array#join relies on this when
  # #inner_html flattens mixed content.
  def to_s
    to_html
  end

  # Casts +value+ according to the node's "type" attribute
  # ("integer", "boolean", "datetime" or "date"); any other or missing type
  # returns +value+ unchanged.
  def typecast_value( value )
    case attrs["type"]
      when "integer"  then value.to_i
      when "boolean"  then value.strip == "true"
      when "datetime" then ::Time.parse(value).utc
      when "date"     then ::Date.parse(value)
      else                 value
    end
  end

  # Replaces the five predefined XML entities with their characters.
  # "&amp;" is handled last so that "&amp;lt;" becomes "&lt;" and not "<".
  def translate_xml_entities(value)
    value.gsub(/&lt;/,   "<").
          gsub(/&gt;/,   ">").
          gsub(/&quot;/, '"').
          gsub(/&apos;/, "'").
          gsub(/&amp;/,  "&")
  end

  # Returns a new Hash whose keys have "-" translated to "_".
  # A nil argument is treated as an empty Hash.
  def undasherize_keys(params)
    (params || {}).inject({}) do |result, (k, v)|
      result[k.tr("-", "_")] = v
      result
    end
  end

  # Concatenated children: text as is, nested nodes through #to_s.
  def inner_html
    @children.join
  end

  # Markup for this node, e.g. <em class='x'>text</em>.
  def to_html
    "<#{name}#{attributes_to_xml}>#{inner_html}</#{name}>"
  end

  # Attribute list for the opening tag. Every attribute carries a leading
  # space so it is separated from the tag name and from its neighbours.
  def attributes_to_xml
    attrs.map { |k, v| " #{k}='#{v}'" }.join
  end
end
251
252
# Converts the feed stored in @xml to a Hash using XMLUtilityNode
# (the libxml SAX based implementation) and returns that Hash.
def do_from_xml
  XMLUtilityNode.hash_from_xml(@xml)
end
256
257
258
# Core extensions to Hash: XML conversion (through Hpricot), query-string
# building, key filtering, key symbolizing and method-style key access.
class Hash

  class << self
    # Converts valid XML into a Ruby Hash structure.
    # <tt>xml</tt>:: A string representation of valid XML
    #
    # == Typecasting
    # Typecasting is performed on elements that have a "<tt>type</tt>" attribute of
    # <tt>integer</tt>:: Returns an Integer (the content's +to_i+)
    # <tt>boolean</tt>:: anything other than "true" evaluates to false
    # <tt>datetime</tt>:: Returns a Time object.  See +Time+ documentation for valid Time strings
    # <tt>date</tt>:: Returns a Date object.  See +Date+ documentation for valid Date strings
    #
    # Dashes in keys are automatically converted to underscores
    #
    # == Caveats
    # * Mixed content tags are assumed to be text and any xml tags are kept as a String
    # * Any attributes other than type on a node containing a text node will be discarded
    #
    # == Examples
    #
    # ===Standard
    # <user gender='m'>
    #   <age type='integer'>35</age>
    #   <name>Homer Simpson</name>
    #   <dob type='date'>1988-01-01</dob>
    #   <joined-at type='datetime'>2000-04-28 23:01</joined-at>
    #   <is-cool type='boolean'>true</is-cool>
    # </user>
    #
    # evaluates to
    #
    # { "user" =>
    #         { "gender"    => "m",
    #           "age"       => 35,
    #           "name"      => "Homer Simpson",
    #           "dob"       => DateObject( 1988-01-01 ),
    #           "joined_at" => TimeObject( 2000-04-28 23:01),
    #           "is_cool"   => true
    #         }
    #     }
    #
    # === Mixed Content
    # <story>
    #   A Quick <em>brown</em> Fox
    # </story>
    #
    # evaluates to
    # { "story" => "A Quick <em>brown</em> Fox" }
    #
    # === Attributes other than type on a node containing text
    # <story is-good='false'>
    #   A Quick <em>brown</em> Fox
    # </story>
    #
    # evaluates to
    # { "story" => "A Quick <em>brown</em> Fox" }
    #
    # <bicep unit='inches' type='integer'>60</bicep>
    #
    # evaluates with a typecast to an integer.  But ignores the unit attribute
    # { "bicep" => 60 }
    def from_xml( xml )
      undasherize_keys( Hpricot::XML( xml ).root.to_hash )
    end

    private

    # Returns a copy of +params+ in which every Hash key, at any nesting
    # depth (including Hashes inside Arrays), is a String with "-" replaced
    # by "_". Values that are neither Hash nor Array are returned as is.
    def undasherize_keys(params)
      case params
      when Hash
        params.inject({}) do |h, (k, v)|
          h[k.to_s.tr("-", "_")] = undasherize_keys(v)
          h
        end
      when Array
        params.map { |v| undasherize_keys(v) }
      else
        params
      end
    end
  end

  # Builds a query string such as "a=1&b[c]=2". Nested Hashes are written
  # with bracketed keys. Keys and values are inserted as is (no escaping).
  def to_params
    result = ''
    stack = []

    each do |key, value|
      if Hash === value
        stack << [key, value]
      else
        result << "#{key}=#{value}&"
      end
    end

    # The stack grows while it is being walked: nested Hashes found here
    # are appended and reached later by the same loop.
    stack.each do |parent, hash|
      hash.each do |key, value|
        if Hash === value
          stack << ["#{parent}[#{key}]", value]
        else
          result << "#{parent}[#{key}]=#{value}&"
        end
      end
    end

    # Drop the trailing "&".
    result.chop
  end

  # lets through the keys in the argument
  # >> {:one => 1, :two => 2, :three => 3}.pass(:one)
  # => {:one=>1}
  def pass(*allowed)
    reject { |k, v| !allowed.include?(k) }
  end
  alias only pass

  # blocks the keys in the arguments
  # >> {:one => 1, :two => 2, :three => 3}.block(:one)
  # => {:two=>2, :three=>3}
  def block(*rejected)
    reject { |k, v| rejected.include?(k) }
  end
  alias except block

  # Destructively convert all keys to symbols, descending into values
  # that are Hashes. Returns self.
  def symbolize_keys!
    keys.each do |key|
      sym = key.to_sym
      unless key.is_a?(Symbol)
        self[sym] = self[key]
        delete(key)
      end
      sub = self[sym]
      sub.symbolize_keys! if Hash === sub
    end
    self
  end

  # Method-style access to entries.
  #
  # * <tt>h.foo = v</tt> stores +v+ under the String key "foo".
  # * <tt>h.foo</tt> returns the value under the Symbol key :foo; when only
  #   the String key "foo" exists (for instance after the writer above) that
  #   value is returned instead, so a write can be read back.
  # * Any other call with arguments raises NoMethodError.
  def method_missing(m, *a)
    name = m.to_s
    if name =~ /=\z/
      self[name.chop] = a[0]
    elsif a.empty?
      string_key_only = !key?(m) && key?(name)
      string_key_only ? self[name] : self[m]
    else
      raise NoMethodError, "#{m}"
    end
  end

  # Also reports true for any name that is a key of this Hash.
  def respond_to?(method, include_private=false)
    return true if key?(method)
    super(method, include_private)
  end

end
404
405
class Hpricot::Elem
  # Converts this Hpricot::Elem to a Hash of the form { name => content }.
  #
  # * If the element has any non-blank text child (this includes mixed
  #   content, i.e. tags and text), content is the stripped inner HTML with
  #   XML entities translated, typecast according to the "type" attribute.
  # * Otherwise content is a Hash: the element's attributes merged with the
  #   hashes of its child elements. Child tag names that occur more than
  #   once are collected into an Array under that name.
  def to_hash
    # Either accessor may be nil; normalise once so that nothing below
    # needs its own nil check.
    attrs = attributes || {}
    kids  = children || []

    if kids.detect { |kid| kid.text? && kid.to_s.strip.length != 0 }
      # There are text nodes in here.
      # Just return the inner HTML of the node, typecast if required.
      content = translate_xml_entities( inner_html.strip )
      value = case attrs["type"]
        when "integer"  then content.to_i
        when "boolean"  then content.strip == "true"
        when "datetime" then ::Time.parse(content).utc
        when "date"     then ::Date.parse(content)
        else                 content
      end
      return { name => value }
    end

    # Work on a copy so that merging children never writes into a Hash
    # that the element itself may still be holding.
    merged = attrs.dup

    # Hashes of the non-text children; children that produced nothing are
    # left out.
    child_hashes = kids.reject { |kid| kid.text? }.
                        map    { |kid| kid.to_hash }.
                        reject { |child_hash| child_hash.empty? }

    # Group by the first key (the child's tag name) to find repeated tags.
    child_hashes.group_by { |child_hash| child_hash.keys.first }.each do |tag, hashes|
      if hashes.size == 1
        # There is only one of these elements: merge it in directly.
        merged.merge!( hashes.first )
      else
        # There is more than one of these elements: collect their contents
        # into an Array.
        merged.merge!( tag => hashes.map { |child_hash| child_hash[tag] } )
      end
    end

    { name => merged }
  end

  private

  # Replaces the five predefined XML entities with their characters.
  # "&amp;" is handled last so that "&amp;lt;" becomes "&lt;" and not "<".
  def translate_xml_entities(value)
    value.gsub(/&lt;/,   "<").
          gsub(/&gt;/,   ">").
          gsub(/&quot;/, '"').
          gsub(/&apos;/, "'").
          gsub(/&amp;/,  "&")
  end
end
472
473
474
475
# Number of conversions each implementation performs per report.
n = 100

# Time the three XML-to-Hash implementations on the same feed.
Benchmark.bmbm do |bench|
  bench.report( 'sax' ) do
    n.times { XMLUtilityNode.hash_from_xml( @xml ) }
  end

  bench.report( 'hpricot' ) do
    n.times { Hash.from_xml( @xml ) }
  end

  bench.report( 'rexml based' ) do
    n.times { ToHashParser.from_xml( @xml ) }
  end
end
483