# Break +str+ into a flat list of HTML tag tokens and text tokens.
#
# Each token is a two-element Array of the form <tt>[ type, token ]</tt>,
# where +type+ is either <tt>:tag</tt> or <tt>:text</tt> and +token+ is the
# matched substring. If a block is given, every pair is also yielded to it
# (as two arguments) in the order it is found.
#
# The scan works on a copy of +str+ loaded into <tt>@scanner</tt>, so the
# caller's string is never modified, but any previous scanner state is
# replaced.
#
# Returns the Array of <tt>[ type, token ]</tt> pairs.
#
# Raises a RuntimeError ("Malformed tag at character ...") if a tag opened
# by HTMLTagOpenRegexp cannot be continued with HTMLTagPart before its
# nesting depth returns to zero.
def tokenize_html( str )
  tokens = []
  @scanner.string = str.dup

  until @scanner.eos?
    @log.debug "Scanning from %p" % @scanner.rest

    # Tags matched by MetaTag are taken whole, without nested matching.
    if (( token = @scanner.scan(MetaTag) ))
      type = :tag

    # Anything matched by HTMLTagOpenRegexp starts a tag that may contain
    # nested angle brackets.
    elsif (( token = @scanner.scan(HTMLTagOpenRegexp) ))
      tagstart = @scanner.pos
      @log.debug " Found the start of a plain tag at %d" % tagstart

      # The opening part counts as depth 1; keep appending parts until
      # every opened level has been closed again.
      depth = 1
      type = :tag

      while depth.nonzero?
        chunk = @scanner.scan( HTMLTagPart )
        unless chunk
          raise "Malformed tag at character %d: %p" %
            [ tagstart, token + @scanner.rest ]
        end

        @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ]
        token << chunk

        # A part that leaves the token ending in '>' closes one level;
        # any other part opens a nested one.
        depth += ( token.end_with?('>') ? -1 : 1 )
        @log.debug " Depth is now #{depth}"
      end

    # Everything else is text.
    else
      @log.debug " Looking for a chunk of text"
      type = :text

      # Scan forward through at least one non-'<' character so the pointer
      # moves beyond any '<' that did not start a tag. When only '<'
      # characters remain, scan_until finds nothing and leaves the pointer
      # where it was, which would loop forever -- take the leftover run of
      # '<' as the final text token instead.
      token = @scanner.scan_until( /[^<]+/m ) || @scanner.scan( /<+/ )
    end

    @log.debug " type: %p, token: %p" % [ type, token ]

    # Feed the block one token at a time, and collect the token for the
    # return value regardless.
    yield( type, token ) if block_given?
    tokens << [ type, token ]
  end

  tokens
end