Class HTML::HTMLParser
In: lib/html/htmlparser.rb
Parent: SGMLParser

(X)HTML parser.

Parses a String and returns an HTML::Document with the (X)HTML content.

For example:

  html = "<p>paragraph</p>"
  document = HTMLParser.parse(html)
  puts document

Requires a patched version of SGMLParser.

Methods

Attributes

document  [R] 

Public Class methods

[Source]

     # File lib/html/htmlparser.rb, line 368
368:         def initialize()  # Sets up a parser with an empty document; takes no arguments.
369:             super
370:             @document = HTML::Document.new("")  # built from an empty string; nodes are added by the handler callbacks
371:             @current = @document.root  # @current is the node that receives new children
372:         end

[Source]

     # File lib/html/htmlparser.rb, line 362
362:         def self.parse(html)  # Convenience entry point: parses +html+ with a fresh parser and returns its document.
363:             parser = HTMLParser.new
364:             parser.feed(html)  # feed is not defined in this listing; presumably inherited from SGMLParser -- confirm
365:             parser.document  # the value of the +document+ reader after feeding
366:         end

Public Instance methods

[Source]

     # File lib/html/htmlparser.rb, line 378
378:         def handle_comment(data)  # Empty body: +data+ is ignored and nothing is added to the document.
379:         end

[Source]

     # File lib/html/htmlparser.rb, line 374
374:         def handle_data(data)  # Appends +data+ as an HTML::Text child of the current node.
375:             @current.children << HTML::Text.new(@current, 0, 0, data)  # NOTE(review): the two 0 arguments look like line/position placeholders -- confirm against HTML::Text
376:         end

[Source]

     # File lib/html/htmlparser.rb, line 381
381:         def handle_special(data)  # Empty body: +data+ is ignored and nothing is added to the document.
382:         end

[Source]

     # File lib/html/htmlparser.rb, line 398
398:         def unknown_charref(ref)  # Empty body: +ref+ is ignored and nothing is added to the document.
399:         end

[Source]

     # File lib/html/htmlparser.rb, line 394
394:         def unknown_endtag(tag)  # Moves the current node up one level; +tag+ is never compared with the open element's name.
395:             @current = @current.parent if @current.parent  # stays put when the current node has no parent
396:         end

[Source]

     # File lib/html/htmlparser.rb, line 401
401:         def unknown_entityref(ref)  # Appends a text node built from +ref+ to the current node.
402:             @current.children << HTML::Text.new(@current, 0, 0, "&amp;#{ref}&lt;")  # NOTE(review): emits "&amp;" + ref + "&lt;" as written; confirm this is intended and not an escaping artifact
403:         end

[Source]

     # File lib/html/htmlparser.rb, line 384
384:         def unknown_starttag(tag, attrs)  # Adds an HTML::Tag for +tag+ under the current node and makes it the new current node.
385:             attrs = attrs.inject({}) do |hash, attr|  # turn the attrs entries into a Hash
386:                 hash[attr[0].downcase] = attr[1]  # attr[0] is used as the key (lower-cased), attr[1] as the value
387:                 hash
388:             end
389:             element = HTML::Tag.new(@current || @document, 0, 0, tag.downcase, attrs, true)  # tag name is lower-cased; NOTE(review): meaning of the 0, 0 and true arguments is not shown here -- confirm against HTML::Tag
390:             @current.children << element  # NOTE(review): unlike the line above, this has no fallback when @current is nil
391:             @current = element  # later data and tags nest inside this element until unknown_endtag moves back up
392:         end

[Validate]