# From lib/scraper/base.rb.
#
# Scrapes the document: walks its elements depth-first, in document
# order, and runs this scraper's rules (as defined on the class)
# against every element that has not been skipped.
#
# Each rule is a [selector, extractor, rule_name, first_only] tuple.
# The selector picks elements starting at the current node, and the
# extractor (an unbound method, bound to this scraper before the call)
# is invoked once per picked element. A rule flagged first_only is
# retired after its first match.
#
# Retrieving the document may raise HTTPError or HTMLParseError.
#
# Returns whatever #result returns once #collect has run, or nil when
# the document is neither an Array nor an HTML::Node.
def scrape
  # Call prepare with the document before doing anything else.
  prepare document

  case document
  when Array
    # Reversed because nodes are popped off the end of the stack.
    stack = @document.reverse
  when HTML::Node
    # If a root element is specified, start selecting from there.
    # The stack is empty if we can't find any root element. The node
    # we start from may be a tag, or an HTML::Document.root, which is
    # the equivalent of a document fragment; in the latter case we
    # start from its children.
    root_element = option(:root_element)
    root = root_element ? @document.find(:tag => root_element) : @document
    stack =
      if !root
        []
      elsif root.tag?
        [root]
      else
        root.children.reverse
      end
  else
    return
  end

  # @skip stores all the elements we want to skip (see #skip).
  # @stop ends the traversal early once it becomes true; it is
  # presumably set by a companion method that is not visible here.
  @skip = []
  @stop = false
  # Work on a copy: first_only rules are removed as they fire, and
  # that must not alter the class-level rule list.
  rules = self.class.rules.clone

  begin
    # Process the document one node at a time. We take nodes from the
    # end of the stack, so whenever we descend into an element its
    # children are appended in reverse order.
    while (node = stack.pop)
      break if @stop

      # Only elements are matched; text nodes are ignored.
      next unless node.tag?

      # Ignore any element on the skip list, removing every entry for
      # it (an element may have been added more than once, and we
      # never visit the same element twice).
      # Note: equal? is faster than == for nodes.
      skip_this = false
      @skip.delete_if { |skipped| skip_this = true if skipped.equal?(node) }
      next if skip_this

      # Run through the rules until the current element is processed
      # (skip_this becomes true) or we run out of rules. The block's
      # value decides whether the rule is removed from the list.
      rules.delete_if do |selector, extractor, _rule_name, first_only|
        # The current element was claimed by an earlier rule: stop
        # here and leave the remaining rules untouched.
        break if skip_this

        # match returns nil, a single element or an array of elements.
        selected = selector.match(node, first_only)
        next false unless selected

        selected = [selected] unless selected.is_a?(Array)
        selected = [selected.first] if first_only

        selected.each do |element|
          # Do not process elements that are already on the skip list.
          # Selected elements may be descendants of the current node,
          # so the same element can be offered more than once.
          next if @skip.any? { |skipped| skipped.equal?(element) }

          # Call the extractor with this element and remember that
          # something was extracted when it returns a truthy value.
          @extracted = true if extractor.bind(self).call(element)

          # A literal true on the skip list marks "skip the element
          # that was just processed" (presumably pushed by #skip when
          # called without an element -- confirm against #skip).
          next unless @skip.delete(true)

          if element.equal?(node)
            # Skipping the current element: no more rules, no descent.
            skip_this = true
          else
            # Skipping a descendant: remember it for when we get there.
            @skip << element
          end
        end

        # Retire a first_only rule once it has matched something.
        first_only && !selected.empty?
      end

      # If we did not skip the element, process its children next.
      # Reverse order, since we are popping from the stack.
      next if skip_this

      children = node.children
      stack.concat(children.reverse) if children
    end
  ensure
    @skip = nil
  end

  collect
  result
end