| Class | HTML::HTMLParser |
| In: |
lib/html/htmlparser.rb
|
| Parent: | SGMLParser |
(X)HTML parser.
Parses a String and returns an HTML::Document with the (X)HTML content.
For example:
html = "<p>paragraph</p>"
parser = HTMLParser.new
parser.feed(html)
puts parser.document
Requires a patched version of SGMLParser.
| document | [R] |
# File lib/html/htmlparser.rb, line 368
# Sets up a new parser: runs the superclass initializer, builds an
# HTML::Document from an empty string, and makes that document's root
# the node that subsequent parser callbacks append to.
def initialize
  super
  @document = HTML::Document.new("")
  @current = @document.root
end
# File lib/html/htmlparser.rb, line 362
# Convenience entry point: creates a fresh HTMLParser, feeds it +html+,
# and returns the parser's document.
def self.parse(html)
  HTMLParser.new.tap { |parser| parser.feed(html) }.document
end
# File lib/html/htmlparser.rb, line 374
# Appends +data+ to the current node's children as a new HTML::Text
# node whose parent is the current node.
def handle_data(data)
  # The two zeros are passed straight to HTML::Text.new; presumably
  # line and position placeholders -- confirm against HTML::Text.
  text_node = HTML::Text.new(@current, 0, 0, data)
  @current.children << text_node
end
# File lib/html/htmlparser.rb, line 394
# Closes the current element by moving the current node up to its
# parent. When the current node has no parent it stays where it is.
# The +tag+ argument is not consulted.
def unknown_endtag(tag)
  parent = @current.parent
  @current = parent if parent
end
# File lib/html/htmlparser.rb, line 401
# Handles an entity reference that was not recognised: the reference is
# kept as literal text and appended to the current node's children as a
# new HTML::Text node.
#
# ref:: the entity name without the leading "&" and trailing ";"
def unknown_entityref(ref)
  # Re-emit the reference in its "&name;" form. A trailing "<" here
  # would drop the terminator and inject a stray tag-open character.
  @current.children << HTML::Text.new(@current, 0, 0, "&#{ref};")
end
# File lib/html/htmlparser.rb, line 384
# Opens a new element: builds an HTML::Tag named after the lower-cased
# +tag+, appends it to its parent's children, and makes it the current
# node so that following content nests inside it.
#
# tag::   the tag name as it appeared in the input
# attrs:: a list of [name, value] pairs; names are lower-cased and a
#         later duplicate name overwrites an earlier one
def unknown_starttag(tag, attrs)
  attributes = attrs.each_with_object({}) do |(name, value), hash|
    hash[name.downcase] = value
  end

  # Resolve the parent once and use it for both the tag's parent link
  # and the append, so a missing current node falls back to the
  # document instead of raising on nil.
  parent = @current || @document
  element = HTML::Tag.new(parent, 0, 0, tag.downcase, attributes, true)
  parent.children << element
  @current = element
end