Class HTML::SGMLParser
In: lib/html/htmlparser.rb
Parent: Object

A parser for SGML that uses the derived class as a static DTD.

Methods

Constants

Interesting = /[&<]/   Regular expressions used for parsing:
Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + '![^<>]*)?')
Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/
Charref = /&#([0-9]+)[^0-9]/
Starttagopen = /<[>a-zA-Z]/
Endtagopen = /<\/[<>a-zA-Z]/
Endbracket = /<|>|\/>/   Assaf: fixed to allow tag to close itself (XHTML)
Special = /<![^<>]*>/
Commentopen = /<!--/
Commentclose = /--[ \t\n]*>/
Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' + '(\s*=\s*' + "('[^']*'" + '|"[^"]*"' + '|[-~a-zA-Z0-9,.:+*%?!()_#=]*))?')   Assaf: / is no longer part of allowed attribute value
Entitydefs = {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

Public Class methods

[Source]

    # File lib/html/htmlparser.rb, line 34
# Builds a parser and puts it in its initial state.
#
# verbose:: stored in @verbose (defaults to false).
#
# The remaining state is set up by #reset.
def initialize(verbose=false)
  @verbose = verbose
  reset
end

Public Instance methods

[Source]

    # File lib/html/htmlparser.rb, line 65
# Signals the end of input: runs #goahead with its end flag set to true
# and returns whatever #goahead returns.
def close
  goahead(true)
end

[Source]

    # File lib/html/htmlparser.rb, line 60
# Appends +data+ to the pending input buffer (@rawdata) and then runs
# #goahead with its end flag set to false.
def feed(data)
  @rawdata.concat(data)
  goahead(false)
end

[Source]

     # File lib/html/htmlparser.rb, line 255
# Closes open elements in response to an end tag.
#
# tag:: the end tag's name; the empty string means "close the element
#       on top of the stack".
#
# With an empty +tag+ and an empty stack, unknown_endtag is called.
# With a named +tag+ that is not on the stack, unknown_endtag is called
# only when the object has no 'end_<tag>' method; nothing is popped.
# Otherwise every element from the top of the stack down to and
# including the matching one is closed: handle_endtag is called when an
# 'end_<name>' method exists, unknown_endtag when it does not, and the
# entry is then popped.
#
# Always returns nil.
def finish_endtag(tag)
  if tag == ''
    found = @stack.length - 1
    if found < 0
      unknown_endtag(tag)
      return
    end
  else
    # Match the innermost (most recently opened) element with this name.
    # Using the first occurrence would also close every outer element,
    # e.g. "</b>" against [b, i, b] would empty the whole stack.
    found = @stack.rindex(tag)
    unless found
      unknown_endtag(tag) unless respond_to?('end_' + tag)
      return
    end
  end
  while @stack.length > found
    open_tag = @stack[-1]
    method = 'end_' + open_tag
    if respond_to?(method)
      handle_endtag(open_tag, method)
    else
      unknown_endtag(open_tag)
    end
    @stack.pop
  end
end

[Source]

     # File lib/html/htmlparser.rb, line 237
# Dispatches a start tag to the matching handler method.
#
# tag::   tag name used to build the handler method name
# attrs:: attribute list passed through to the handler
#
# Returns:
#  1:: a 'start_<tag>' method exists; +tag+ is pushed onto @stack and
#      handle_starttag is called
#  0:: only a 'do_<tag>' method exists; handle_starttag is called and the
#      stack is left alone
# -1:: neither exists; unknown_starttag is called
def finish_starttag(tag, attrs)
  opener = 'start_' + tag
  if respond_to?(opener)
    @stack << tag
    handle_starttag(tag, opener, attrs)
    return 1
  end

  standalone = 'do_' + tag
  unless respond_to?(standalone)
    unknown_starttag(tag, attrs)
    return -1
  end

  handle_starttag(tag, standalone, attrs)
  0
end

[Source]

     # File lib/html/htmlparser.rb, line 69
 # Main scanning loop: works through the text buffered in @rawdata and
 # dispatches each piece to the matching handler or parse_* method.
 #
 # _end:: when truthy, text that could not be parsed is passed to
 #        handle_data instead of being kept for a later call.
 #
 # Whatever has not been consumed is stored back into @rawdata.
 def goahead(_end)
   buffer = @rawdata
   pos = 0
   limit = buffer.length
   while pos < limit
     # Once tags are switched off, everything left is plain data.
     if @nomoretags
       handle_data(buffer[pos..(limit-1)])
       pos = limit
       break
     end

     # Emit the run of ordinary text up to the next '&' or '<'.
     stop = buffer.index(Interesting, pos) || limit
     handle_data(buffer[pos..(stop-1)]) if pos < stop
     pos = stop
     break if pos == limit

     if buffer[pos] == ?<
       if buffer.index(Starttagopen, pos) == pos
         if @literal
           handle_data(buffer[pos, 1])
           pos += 1
         else
           # parse_starttag returns an absolute position, or nil when
           # the tag is not complete yet.
           after = parse_starttag(pos)
           break unless after
           pos = after
         end
         next
       elsif buffer.index(Endtagopen, pos) == pos
         after = parse_endtag(pos)
         break unless after
         pos = after
         @literal = false
         next
       elsif buffer.index(Commentopen, pos) == pos
         if @literal
           handle_data(buffer[pos,1])
           pos += 1
         else
           # parse_comment returns a length relative to pos.
           consumed = parse_comment(pos)
           break unless consumed
           pos += consumed
         end
         next
       elsif buffer.index(Special, pos) == pos
         if @literal
           handle_data(buffer[pos, 1])
           pos += 1
         else
           # parse_special returns a length relative to pos.
           consumed = parse_special(pos)
           break unless consumed
           pos += consumed
         end
         next
       end
     elsif buffer[pos] == ?&
       if buffer.index(Charref, pos) == pos
         found = Regexp.last_match
         pos += found[0].length
         handle_charref(found[1])
         # The pattern swallows one terminating character; give it back
         # unless it was the ';' that belongs to the reference.
         pos -= 1 unless buffer[pos-1] == ?;
         next
       elsif buffer.index(Entityref, pos) == pos
         found = Regexp.last_match
         pos += found[0].length
         handle_entityref(found[1])
         pos -= 1 unless buffer[pos-1] == ?;
         next
       end
     else
       raise RuntimeError, 'neither < nor & ??'
     end

     # Reached only when none of the constructs above matched at pos.
     partial = buffer.index(Incomplete, pos)
     unless partial == pos
       handle_data(buffer[pos, 1])
       pos += 1
       next
     end
     stop = partial + Regexp.last_match(0).length
     break if stop == limit # really incomplete: wait for more input
     handle_data(buffer[pos..(stop-1)])
     pos = stop
   end

   if _end && pos < limit
     handle_data(@rawdata[pos..(limit-1)])
     pos = limit
   end
   @rawdata = buffer[pos..-1]
 end

[Source]

     # File lib/html/htmlparser.rb, line 308
# Handles a numeric character reference.
#
# name:: the reference's number, normally the digit string captured
#        from "&#NNN"
#
# A value in 0..255 is converted with Integer#chr and passed to
# handle_data, whose result is returned. Anything else (out of range or
# not a number) goes to unknown_charref and nil is returned.
def handle_charref(name)
  code =
    begin
      # Character references are decimal. Without an explicit base a
      # leading zero selects octal, so "065" would become 53 and "08"
      # would be rejected.
      name.is_a?(String) ? Integer(name, 10) : Integer(name)
    rescue StandardError
      -1
    end
  unless code >= 0 && code <= 255
    unknown_charref(name)
    return
  end
  handle_data(code.chr)
end

[Source]

     # File lib/html/htmlparser.rb, line 330
# Hook called by parse_comment with the text found between "<!--" and
# the comment terminator. Does nothing here; meant to be overridden.
def handle_comment(data)
  # intentionally empty
end

[Source]

     # File lib/html/htmlparser.rb, line 327
# Hook that receives runs of character data from the parser. Does
# nothing here; meant to be overridden.
def handle_data(data)
  # intentionally empty
end

[Source]

     # File lib/html/htmlparser.rb, line 297
# Invokes the end-tag handler named +method+ on this object, with no
# arguments, and returns its result. +tag+ is not used here.
def handle_endtag(tag, method)
  send(method)
end

[Source]

     # File lib/html/htmlparser.rb, line 317
# Handles a named entity reference.
#
# name:: entity name without the leading '&' (e.g. 'amp')
#
# A name listed in Entitydefs has its replacement text passed to
# handle_data, and that call's result is returned. Any other name goes
# to unknown_entityref and nil is returned.
def handle_entityref(name)
  return handle_data(Entitydefs[name]) if Entitydefs.include?(name)

  unknown_entityref(name)
  nil
end

[Source]

     # File lib/html/htmlparser.rb, line 333
# Hook called by parse_special with the text of a "<!...>" construct
# (without the opening '<' and the closing bracket). Does nothing here;
# meant to be overridden.
def handle_special(data)
  # intentionally empty
end

[Source]

     # File lib/html/htmlparser.rb, line 293
# Invokes the start-tag handler named +method+ on this object, passing
# +attrs+ as its only argument, and returns its result. +tag+ is not
# used here.
def handle_starttag(tag, method, attrs)
  send(method, attrs)
end

[Source]

    # File lib/html/htmlparser.rb, line 47
# Returns true when +gi+ is currently on the open-element stack
# (@stack), false otherwise.
def has_context(gi)
  @stack.include?(gi)
end

[Source]

     # File lib/html/htmlparser.rb, line 164
# Parses the comment that starts at index +i+ of @rawdata.
#
# i:: index of the '<' of the "<!--" opener
#
# Returns the number of characters the comment occupies (opener through
# terminator), or nil when no terminator is in the buffer yet. The text
# after the opener and before the terminator is passed to
# handle_comment.
#
# Raises RuntimeError when the text at +i+ is not "<!--".
def parse_comment(i)
  rawdata = @rawdata
  unless rawdata[i, 4] == '<!--'
    # The message used to name handle_comment, which is not the method
    # that was called incorrectly.
    raise RuntimeError, 'unexpected call to parse_comment'
  end
  close_at = rawdata.index(Commentclose, i)
  return nil unless close_at
  close_len = $&.length
  handle_comment(rawdata[i+4..(close_at-1)])
  close_at + close_len - i
end

[Source]

     # File lib/html/htmlparser.rb, line 225
# Parses the end tag that starts at index +i+ of @rawdata.
#
# i:: index of the '<' of the "</" opener
#
# Looks for the next Endbracket match after +i+; returns nil when there
# is none. Otherwise the text between "</" and that match is stripped,
# downcased and passed to finish_endtag. Returns the index at which
# scanning should resume: just past the bracket when it is '>',
# otherwise the position of the match itself.
def parse_endtag(i)
  text = @rawdata
  close_at = text.index(Endbracket, i + 1)
  return nil if close_at.nil?

  name = text[i+2..close_at-1].strip.downcase
  close_at += 1 if text[close_at] == ?>
  finish_endtag(name)
  close_at
end

[Source]

     # File lib/html/htmlparser.rb, line 284
# Parses the "<!...>" construct that starts at index +i+ of @rawdata.
#
# i:: index of the opening '<'
#
# Returns nil when no Endbracket match follows +i+. Otherwise the text
# after '<' and before the match is passed to handle_special, and the
# number of characters from +i+ through the end of the match is
# returned.
def parse_special(i)
  text = @rawdata
  close_at = text.index(Endbracket, i + 1)
  return nil if close_at.nil?

  bracket_len = Regexp.last_match(0).length
  handle_special(text[i+1..(close_at-1)])
  close_at - i + bracket_len
end

[Source]

     # File lib/html/htmlparser.rb, line 178
# Parses the start tag that begins at index +i+ of @rawdata.
#
# i:: index of the opening '<'
#
# Returns nil when no Endbracket match follows +i+ (tag not complete
# yet). Otherwise the tag name and its attributes are extracted,
# finish_starttag is called, and the index at which scanning should
# resume is returned. A tag ending in "/>" is followed immediately by
# finish_endtag for the same name.
#
# Attributes are collected as [name, value] pairs; names are downcased,
# surrounding quotes are removed from values, and an attribute without
# "=value" gets the empty string. "<>" reuses the last tag name seen
# (@lasttag).
#
# Raises RuntimeError when no tag name can be found after +i+.
def parse_starttag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j
  attrs = []
  if rawdata[i + 1, 1] == '>'
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    match = rawdata.index(Tagfind, i + 1)
    unless match
      raise RuntimeError, 'unexpected call to parse_starttag'
    end
    k = i + 1 + $&.length
    tag = $&.downcase
    @lasttag = tag
  end
  while k < j
    # Assaf: fixed to allow tag to close itself (XHTML)
    idx = rawdata.index(Attrfind, k)
    break unless idx && idx < j
    matched_length = $&.length
    attrname, rest, attrvalue = $1, $2, $3
    if !rest
      attrvalue = '' # was: = attrname
    # Assaf: fixed to handle double quoted attribute values properly
    elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
          (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
      attrvalue = attrvalue[1..-2]
    end
    attrs << [attrname.downcase, attrvalue]
    # Resume after the end of the match. The match may begin later than
    # k (index searches forward), so adding only its length to k would
    # leave k inside the attribute and re-scan its tail.
    k = idx + matched_length
  end
  # Assaf: fixed to allow tag to close itself (XHTML)
  if rawdata[j, 2] == '/>'
    j += 2
    finish_starttag(tag, attrs)
    finish_endtag(tag)
  else
    j += 1 if rawdata[j, 1] == '>'
    finish_starttag(tag, attrs)
  end
  j
end

[Source]

     # File lib/html/htmlparser.rb, line 301
# Prints a diagnostic about an end tag that has no matching open
# element, followed by the current element stack. Does nothing unless
# @verbose is set.
#
# tag:: name of the unbalanced end tag
#
# Returns nil.
def report_unbalanced(tag)
  return unless @verbose

  print '*** Unbalanced </' + tag + '>', "\n"
  # Read the instance variable directly: no public `stack` reader is
  # listed for this class, so `self.stack` would raise NoMethodError.
  # NOTE(review): confirm no subclass relies on overriding `stack`.
  print '*** Stack:', @stack, "\n"
end

[Source]

    # File lib/html/htmlparser.rb, line 39
# Puts the parser back into its initial state: empty input buffer,
# empty element stack, placeholder last-tag name ('???'), and both the
# no-more-tags and literal modes switched off.
def reset
  @stack = []
  @rawdata = ''
  @lasttag = '???'
  @literal = @nomoretags = false
end

[Source]

    # File lib/html/htmlparser.rb, line 56
# Switches literal mode on (@literal = true). Any arguments are
# accepted and ignored.
def setliteral(*args)
  @literal = true
end

[Source]

    # File lib/html/htmlparser.rb, line 51
# Switches both the no-more-tags mode and literal mode on. Once
# @nomoretags is set, goahead passes all remaining input to handle_data.
def setnomoretags
  @nomoretags = @literal = true
end

[Source]

     # File lib/html/htmlparser.rb, line 340
# Hook called by handle_charref for a numeric character reference it
# cannot convert. Does nothing here; meant to be overridden.
def unknown_charref(ref)
  # intentionally empty
end

[Source]

     # File lib/html/htmlparser.rb, line 338
# Hook called for an end tag that has no 'end_<tag>' handler. Does
# nothing here; meant to be overridden.
def unknown_endtag(tag)
  # intentionally empty
end

[Source]

     # File lib/html/htmlparser.rb, line 342
# Hook called by handle_entityref for an entity name that is not in
# Entitydefs. Does nothing here; meant to be overridden.
def unknown_entityref(ref)
  # intentionally empty
end

[Source]

     # File lib/html/htmlparser.rb, line 336
# Hook called by finish_starttag for a tag that has neither a
# 'start_<tag>' nor a 'do_<tag>' handler. Does nothing here; meant to be
# overridden.
def unknown_starttag(tag, attrs)
  # intentionally empty
end

[Validate]