Ticket #95: benchmark_with_confidence_intervals.rb

File benchmark_with_confidence_intervals.rb, 13.1 kB (added by jon.egil.stra..@gmail.com, 1 year ago)
Line 
1 require 'rubygems'
2 gem 'libxml-ruby', '<=0.3.8.4'
3 require 'xml/libxml'
4 require 'benchmark'
5 require 'hpricot'
6 require 'active_record'
7 require 'open-uri'
8 require 'rexml/parsers/streamparser'
9 require 'rexml/parsers/baseparser'
10 require 'rexml/light/node'
11
12
13
# In-memory element node assembled from REXML pull-parser events and
# convertible to a nested Hash.
#
# Dashes in element and attribute names are replaced by underscores. A node
# that received at least one String child is treated as a text node: its
# hash value is the (entity-decoded, optionally typecast) concatenation of
# all children, with element children rendered back to markup.
class REXMLUtilityNode
  attr_accessor :name, :attributes, :children

  # name::       element name; dashes become underscores
  # attributes:: Hash of attribute name => value; a copy with undasherized
  #              keys is stored, the argument itself is left untouched
  def initialize(name, attributes = {})
    @name       = name.tr("-", "_")
    @attributes = undasherize_keys(attributes)
    @children   = []
    @text       = false
  end

  # Appends a child. A String child marks this node as a text node.
  def add_node(node)
    @text = true if node.is_a?(String)
    @children << node
  end

  # Returns <tt>{ name => value }</tt>.
  #
  # For a text node the value is the typecast inner markup. Otherwise it is
  # a Hash of the children (children sharing a name are collected into an
  # Array) merged with this node's attributes.
  def to_hash
    if @text
      return { name => typecast_value(translate_xml_entities(inner_html)) }
    end

    hash = {}
    @children.group_by { |child| child.name }.each do |key, values|
      if values.size == 1
        hash.merge!(values.first.to_hash)
      else
        # repeating group: keep every occurrence, in document order
        hash.merge!(key => values.map { |element| element.to_hash[key] })
      end
    end

    hash.merge!(attributes) unless attributes.empty?
    { name => hash }
  end

  def to_s
    to_html
  end

  # Converts +value+ according to this node's "type" attribute
  # (integer, boolean, datetime, date); any other type returns it unchanged.
  def typecast_value(value)
    case attributes["type"]
    when "integer"  then value.to_i
    when "boolean"  then value.strip == "true"
    when "datetime" then ::Time.parse(value).utc
    when "date"     then ::Date.parse(value)
    else                 value
    end
  end

  # Decodes the five predefined XML entities. <tt>&amp;</tt> is handled last
  # so that an escaped ampersand cannot form a new entity.
  def translate_xml_entities(value)
    value.gsub(/&lt;/,   "<").
          gsub(/&gt;/,   ">").
          gsub(/&quot;/, '"').
          gsub(/&apos;/, "'").
          gsub(/&amp;/,  "&")
  end

  # Returns a new Hash whose keys have dashes replaced by underscores.
  def undasherize_keys(params)
    params.inject({}) do |converted, (key, value)|
      converted[key.tr("-", "_")] = value
      converted
    end
  end

  # Concatenation of all children; element children are rendered via to_s.
  def inner_html
    @children.join
  end

  def to_html
    "<#{name}#{attributes_to_xml}>#{inner_html}</#{name}>"
  end

  # Renders the attributes as <tt> key='value'</tt> pairs. Each pair carries
  # a leading space so it is separated from the tag name and from the
  # previous attribute.
  def attributes_to_xml
    attributes.map { |key, value| " #{key}='#{value}'" }.join
  end
end
98
# Builds a nested Hash from XML using REXML's pull parser (BaseParser).
class ToHashParser
  # xml:: XML source accepted by REXML::Parsers::BaseParser.new
  #
  # Returns the root node's +to_hash+ result, or an empty Hash when the
  # input produced no element at all.
  def self.from_xml(xml)
    stack  = []
    parser = REXML::Parsers::BaseParser.new(xml)

    loop do
      event = parser.pull
      case event[0]
      when :end_document
        break
      when :end_doctype, :start_doctype
        # do nothing
      when :start_element
        stack.push REXMLUtilityNode.new(event[1], event[2])
      when :end_element
        # the root stays on the stack so it can be converted at the end
        if stack.size > 1
          finished = stack.pop
          stack.last.add_node(finished)
        end
      when :text, :cdata
        # CDATA sections carry character data just like text events;
        # whitespace-only runs and text outside any element are ignored
        content = event[1]
        stack.last.add_node(content) unless stack.empty? || content.strip.empty?
      end
    end

    stack.empty? ? {} : stack.pop.to_hash
  end
end
125
126
# Element node filled by libxml SAX callbacks and convertible to a nested
# Hash. One parser and one node stack are shared at class level; the
# callbacks below are registered once, when this class body is evaluated.
class XMLUtilityNode #:nodoc:

  # Lazily created, shared XML::SaxParser instance.
  def self.parser
    @@parser ||= XML::SaxParser.new
  end

  # Shared stack of the elements that are currently open.
  def self.stack
    @@stack ||= []
  end

  parser.on_start_element { |element, attributes| stack.push(XMLUtilityNode.new(element, attributes)) }
  parser.on_characters { |text| stack.last.add_node(text) unless text.strip.length == 0 }

  parser.on_end_element do
    # the root stays on the stack so hash_from_xml can convert it
    if stack.size > 1
      finished = stack.pop
      stack.last.add_node(finished)
    end
  end

  # Parses +xml+ (a String) and returns the root node's +to_hash+ result,
  # or an empty Hash when no element was pushed.
  def self.hash_from_xml(xml)
    # The stack is shared between calls: drop anything left behind by an
    # earlier parse that raised before its root was popped.
    stack.clear
    parser.string = xml
    parser.parse
    stack.empty? ? {} : stack.pop.to_hash
  end

  attr_accessor :name
  attr_writer :attrs

  # _name::  element name; dashes become underscores
  # _attrs:: Hash of attribute name => value (nil is treated as empty);
  #          a copy with undasherized keys is stored
  def initialize(_name, _attrs)
    @name     = _name.tr("-", "_")
    @attrs    = undasherize_keys(_attrs)
    @children = []
    @text     = false
  end

  def attrs
    @attrs ||= {}
  end

  # Appends a child. A String child marks this node as a text node.
  def add_node(attr)
    @text = true if attr.kind_of?(String)
    @children << attr
  end

  def text?
    @text
  end

  # Returns <tt>{ name => value }</tt>.
  #
  # For a text node the value is the typecast inner markup. Otherwise it is
  # a Hash of the children (children sharing a name are collected into an
  # Array) merged with this node's attributes.
  def to_hash
    if @text
      return { name => typecast_value(translate_xml_entities(inner_html)) }
    end

    out = {}
    @children.group_by { |child| child.name }.each do |key, nodes|
      if nodes.size == 1
        out.merge!(nodes.first.to_hash)
      else
        # repeating group: keep every occurrence, in document order
        out.merge!(key => nodes.map { |elem| elem.to_hash[key] })
      end
    end

    out.merge!(attrs) unless attrs.empty?
    { name => out }
  end

  def to_s
    to_html
  end

  # Converts +value+ according to this node's "type" attribute
  # (integer, boolean, datetime, date); any other type returns it unchanged.
  def typecast_value(value)
    case attrs["type"]
    when "integer"  then value.to_i
    when "boolean"  then value.strip == "true"
    when "datetime" then ::Time.parse(value).utc
    when "date"     then ::Date.parse(value)
    else                 value
    end
  end

  # Decodes the five predefined XML entities. <tt>&amp;</tt> is handled last
  # so that an escaped ampersand cannot form a new entity.
  def translate_xml_entities(value)
    value.gsub(/&lt;/,   "<").
          gsub(/&gt;/,   ">").
          gsub(/&quot;/, '"').
          gsub(/&apos;/, "'").
          gsub(/&amp;/,  "&")
  end

  # Returns a new Hash whose keys have dashes replaced by underscores.
  def undasherize_keys(params)
    (params || {}).inject({}) do |converted, (key, value)|
      converted[key.tr("-", "_")] = value
      converted
    end
  end

  # Concatenation of all children; element children are rendered via to_s.
  def inner_html
    @children.join
  end

  def to_html
    "<#{name}#{attributes_to_xml}>#{inner_html}</#{name}>"
  end

  # Renders the attributes as <tt> key='value'</tt> pairs. Each pair carries
  # a leading space so it is separated from the tag name and from the
  # previous attribute.
  def attributes_to_xml
    attrs.map { |key, value| " #{key}='#{value}'" }.join
  end
end
245
246
class Hash

  class << self
    # Converts valid XML into a Ruby Hash structure.
    # <tt>xml</tt>:: A string representation of valid XML
    #
    # == Typecasting
    # Typecasting is performed on elements that have a "<tt>type</tt>" attribute of
    # <tt>integer</tt>::
    # <tt>boolean</tt>:: anything other than "true" evaluates to false
    # <tt>datetime</tt>:: Returns a Time object.  See +Time+ documentation for valid Time strings
    # <tt>date</tt>:: Returns a Date object.  See +Date+ documentation for valid Date strings
    #
    # Keys are automatically converted to +snake_case+
    #
    # == Caveats
    # * Mixed content tags are assumed to be text and any xml tags are kept as a String
    # * Any attributes other than type on a node containing a text node will be discarded
    #
    # == Examples
    #
    # === Standard
    # <user gender='m'>
    #   <age type='integer'>35</age>
    #   <name>Homer Simpson</name>
    #   <dob type='date'>1988-01-01</dob>
    #   <joined-at type='datetime'>2000-04-28 23:01</joined-at>
    #   <is-cool type='boolean'>true</is-cool>
    # </user>
    #
    # evaluates to
    #
    # { "user" =>
    #         { "gender"    => "m",
    #           "age"       => 35,
    #           "name"      => "Homer Simpson",
    #           "dob"       => DateObject( 1988-01-01 ),
    #           "joined_at" => TimeObject( 2000-04-28 23:01),
    #           "is_cool"   => true
    #         }
    #     }
    #
    # === Mixed Content
    # <story>
    #   A Quick <em>brown</em> Fox
    # </story>
    #
    # evaluates to
    # { "story" => "A Quick <em>brown</em> Fox" }
    #
    # === Attributes other than type on a node containing text
    # <story is-good='false'>
    #   A Quick <em>brown</em> Fox
    # </story>
    #
    # evaluates to
    # { "story" => "A Quick <em>brown</em> Fox" }
    #
    # <bicep unit='inches' type='integer'>60</bicep>
    #
    # evaluates with a typecast to an integer.  But ignores the unit attribute
    # { "bicep" => 60 }
    def from_xml(xml)
      undasherize_keys(Hpricot::XML(xml).root.to_hash)
    end

    private

    # Returns a copy of +params+ in which every Hash key, at any nesting
    # depth (including Hashes inside Arrays), is a String with dashes
    # replaced by underscores. Other values are returned unchanged.
    def undasherize_keys(params)
      case params
      when Hash
        params.inject({}) do |converted, (key, value)|
          converted[key.to_s.tr("-", "_")] = undasherize_keys(value)
          converted
        end
      when Array
        params.map { |value| undasherize_keys(value) }
      else
        params
      end
    end
  end

  # Serializes the hash as a query string, rendering nested hashes as
  # <tt>parent[child]=value</tt>. Keys and values are interpolated as-is;
  # no URL escaping is performed.
  #
  # >> {"a" => 1, "b" => {"c" => 2}}.to_params
  # => "a=1&b[c]=2"
  def to_params
    result  = String.new
    pending = []

    each do |key, value|
      if Hash === value
        pending << [key, value]
      else
        result << "#{key}=#{value}&"
      end
    end

    # +pending+ grows while it is being walked: nested hashes are queued
    # with their full prefix and handled by a later iteration.
    pending.each do |parent, hash|
      hash.each do |key, value|
        if Hash === value
          pending << ["#{parent}[#{key}]", value]
        else
          result << "#{parent}[#{key}]=#{value}&"
        end
      end
    end

    result.chop # drop the trailing "&"
  end

  # lets through the keys in the argument
  # >> {:one => 1, :two => 2, :three => 3}.pass(:one)
  # => {:one=>1}
  def pass(*allowed)
    reject { |key, _value| !allowed.include?(key) }
  end
  alias only pass

  # blocks the keys in the arguments
  # >> {:one => 1, :two => 2, :three => 3}.block(:one)
  # => {:two=>2, :three=>3}
  def block(*rejected)
    reject { |key, _value| rejected.include?(key) }
  end
  alias except block

  # Destructively convert all keys to symbols recursively.
  # Keys that cannot be converted (no +to_sym+) are kept as they are;
  # Hash values are processed either way.
  def symbolize_keys!
    keys.each do |key|
      symbol = key.respond_to?(:to_sym) ? key.to_sym : key
      self[symbol] = delete(key) unless key.is_a?(Symbol) || symbol.equal?(key)
      nested = self[symbol]
      nested.symbolize_keys! if Hash === nested
    end
    self
  end

  # Attribute-style access to entries.
  #
  # <tt>hash.foo = value</tt> stores +value+ under the String key "foo".
  # <tt>hash.foo</tt> reads the Symbol key :foo and falls back to the String
  # key "foo" when only that one exists, so a value written through the
  # setter can be read back. Any other call raises NoMethodError.
  def method_missing(m, *a)
    name = m.to_s
    if name =~ /=$/
      self[name.chomp("=")] = a[0]
    elsif a.empty?
      if key?(m) || !key?(name)
        self[m]
      else
        self[name]
      end
    else
      raise NoMethodError, "#{m}"
    end
  end

  # True when +method+ names an entry reachable through method_missing
  # (as given, or as a String key), otherwise defers to the default lookup.
  def respond_to?(method, include_private = false)
    return true if key?(method) || key?(method.to_s)
    super(method, include_private)
  end

  # Keeps Object#method and friends consistent with method_missing.
  def respond_to_missing?(method, include_private = false)
    key?(method) || key?(method.to_s) || super
  end

end
392
393
class Hpricot::Elem
  # Converts this Hpricot::Elem to a hash of the form <tt>{ name => value }</tt>.
  #
  # If the element contains non-blank text (including mixed content, i.e.
  # tags and text), the value is its inner HTML, stripped, entity-decoded
  # and typecast according to the "type" attribute. Otherwise the value is
  # a Hash of the attributes merged with the converted child elements;
  # children sharing a tag name are collected into an Array.
  def to_hash
    # Work on a copy so merging children below never modifies the
    # element's own attributes; a nil attribute set counts as empty.
    own_attributes = attributes
    node = own_attributes.nil? ? {} : {}.merge(own_attributes)
    kids = children || []

    if kids.any? { |child| child.text? && child.to_s.strip.length != 0 }
      # There are text nodes in here.
      # Just return the innerHTML of the node, typecast if required
      content = translate_xml_entities(inner_html.strip)
      value = case node["type"]
              when "integer"  then content.to_i
              when "boolean"  then content.strip == "true"
              when "datetime" then ::Time.parse(content).utc
              when "date"     then ::Date.parse(content)
              else                 content
              end
      return { name => value }
    end

    # Convert every non-text child, drop empty results, and group by the
    # first key to see whether any child tag name is repeated.
    converted = kids.reject { |child| child.text? }.map { |child| child.to_hash }
    grouped   = converted.reject { |child_hash| child_hash.empty? }.
                          group_by { |child_hash| child_hash.keys.first }

    grouped.each do |tag, hashes|
      if hashes.size == 1
        # There is only one of these elements: merge it straight in
        node.merge!(hashes.first)
      else
        # There is more than one of these elements: collect their values
        # into an array
        node.merge!(tag => hashes.map { |child_hash| child_hash[child_hash.keys.first] })
      end
    end

    { name => node }
  end

  private

  # Decodes the five predefined XML entities. <tt>&amp;</tt> is handled last
  # so that an escaped ampersand cannot form a new entity.
  def translate_xml_entities(value)
    value.gsub(/&lt;/,   "<").
          gsub(/&gt;/,   ">").
          gsub(/&quot;/, '"').
          gsub(/&apos;/, "'").
          gsub(/&amp;/,  "&")
  end
end
460
461
# Basic descriptive statistics for arrays of numbers.
class Array
  # Arithmetic mean as a Float. An empty array yields NaN.
  def mean
    inject(0.0) { |sum, elem| sum + elem } / length
  end

  # Unbiased sample variance (divides by n - 1) as a Float.
  # Undefined for fewer than two elements, in which case NaN is returned.
  def variance
    return 0.0 / 0.0 if size < 2

    m = mean
    squared_deviations = inject(0.0) { |sum, elem| sum + (elem - m) ** 2 }
    squared_deviations / (size - 1)
  end

  # Sample standard deviation: the square root of #variance.
  def stdev
    Math.sqrt(variance)
  end
end
477
478
479
# Benchmark driver: for every feed, download the XML once, convert it to a
# Hash n times with each of the three parsers, and print the mean run time
# per parser together with a 95% confidence interval for that mean.
n = 1_000

xml_feeds = []
xml_feeds << "http://feeds.feedburner.com/37signals/beMH" # Hpricot is faster for this feed
#xml_feeds << "http://feeds.gawker.com/gizmodo/excerpts.xml"
xml_feeds << "http://brainspl.at/xml/rss20/feed.xml"
xml_feeds << "http://jobs.37signals.com/categories/2/jobs;rss"
xml_feeds << "http://www.aftenposten.no/eksport/rss-1_0/?seksjon=ece_frontpage"
xml_feeds << "http://merb.devjavu.com/projects/merb/report/9?format=rss"
xml_feeds << "http://odeo.com/profile/OdeoMusic/rss"

xml_feeds.each do |xml_feed|
  # open-uri: use URI.open where it exists, fall back to Kernel#open
  # on Rubies whose open-uri only patches Kernel#open.
  source = URI.respond_to?(:open) ? URI.open(xml_feed) : open(xml_feed)
  xml = source.read

  measures = []

  # The parsers are interleaved inside one loop rather than run in three
  # separate batches.
  n.times do
    measures << Benchmark.measure('libxml')  { XMLUtilityNode.hash_from_xml(xml) }
    measures << Benchmark.measure('hpricot') { Hash.from_xml(xml) }
    measures << Benchmark.measure('rexml')   { ToHashParser.from_xml(xml) }
  end

  puts "\n\n-- #{xml_feed} (n=#{n}) --"
  puts "           mean     95% interval"
  labels = measures.map { |measure| measure.label }.uniq.sort
  labels.each do |label|
    times = measures.select { |measure| measure.label == label }.map { |measure| measure.total }
    average = times.mean
    # Confidence interval of the mean: 1.96 standard errors on either side,
    # where the standard error is stdev / sqrt(sample size).
    margin    = 1.96 * times.stdev / Math.sqrt(times.size)
    conf_low  = average - margin
    conf_high = average + margin
    puts "#{label.ljust(10)} #{sprintf("%.2f", average)}   " <<
         "(#{sprintf("%.2f", conf_low).rjust(5)} .. #{sprintf("%.2f", conf_high)} )"
  end

end
515