| Class | HTML::SGMLParser |
| In: |
lib/html/htmlparser.rb
|
| Parent: | Object |
A parser for SGML that uses the derived class as a static DTD.
| Interesting | = | /[&<]/ | Regular expressions used for parsing: | |
| Incomplete | = | Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' + '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' + '![^<>]*)?') | ||
| Entityref | = | /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/ | ||
| Charref | = | /&#([0-9]+)[^0-9]/ | ||
| Starttagopen | = | /<[>a-zA-Z]/ | ||
| Endtagopen | = | /<\/[<>a-zA-Z]/ | ||
| Endbracket | = | /<|>|\/>/ | Assaf: fixed to allow tag to close itself (XHTML) | |
| Special | = | /<![^<>]*>/ | ||
| Commentopen | = | /<!--/ | ||
| Commentclose | = | /--[ \t\n]*>/ | ||
| Tagfind | = | /[a-zA-Z][a-zA-Z0-9.-]*/ | ||
| Attrfind | = | Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' + '(\s*=\s*' + "('[^']*'" + '|"[^"]*"' + '|[-~a-zA-Z0-9,.:+*%?!()_#=]*))?') | Assaf: / is no longer part of allowed attribute value | |
| Entitydefs | = | {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''} |
# File lib/html/htmlparser.rb, line 34
# Create a parser.
#
# verbose:: stored in @verbose; report_unbalanced only prints when it is
#           truthy.
#
# The parsing state itself is set up by #reset.
def initialize(verbose = false)
  @verbose = verbose
  reset
end
# File lib/html/htmlparser.rb, line 60
# Append +data+ to the pending input buffer and parse as much of the buffer
# as is currently possible.
#
# data:: text to append to @rawdata (the buffer is modified in place).
#
# Returns whatever goahead(false) returns.
def feed(data)
  @rawdata.concat(data)
  goahead(false)
end
# File lib/html/htmlparser.rb, line 255
# Close the element named +tag+ together with every element opened after it.
#
# tag:: lower-cased element name, or '' for the SGML shorthand "</>" which
#       closes only the most recently opened element.
#
# For each element that gets closed, handle_endtag is called when the parser
# responds to "end_<name>", otherwise unknown_endtag is called; the element
# is then popped from @stack.
#
# When +tag+ is not on the stack nothing is popped; unknown_endtag is called
# only if there is no "end_<tag>" handler either.
def finish_endtag(tag)
  if tag == ''
    found = @stack.length - 1
    if found < 0
      # Nothing is open, so there is nothing the shorthand could close.
      unknown_endtag(tag)
      return
    end
  else
    unless @stack.include?(tag)
      unknown_endtag(tag) unless respond_to?('end_' + tag)
      return
    end
    # Use the innermost (most recently opened) matching element. Taking the
    # first match instead would make "</b>" in "<b><i><b>" close all three
    # elements rather than just the inner <b>.
    found = @stack.rindex(tag)
  end
  while @stack.length > found
    open_tag = @stack[-1]
    method = 'end_' + open_tag
    if respond_to?(method)
      handle_endtag(open_tag, method)
    else
      unknown_endtag(open_tag)
    end
    @stack.pop
  end
end
# File lib/html/htmlparser.rb, line 237
# Dispatch a parsed start tag to the matching handler.
#
# tag::   lower-cased element name.
# attrs:: array of [name, value] pairs.
#
# Returns
# *  1 when a "start_<tag>" handler exists (the tag is pushed on @stack
#    before the handler runs),
# *  0 when only a "do_<tag>" handler exists (nothing is pushed),
# * -1 when neither exists and unknown_starttag was called.
def finish_starttag(tag, attrs)
  opener = 'start_' + tag
  if respond_to?(opener)
    @stack << tag
    handle_starttag(tag, opener, attrs)
    return 1
  end

  standalone = 'do_' + tag
  unless respond_to?(standalone)
    unknown_starttag(tag, attrs)
    return -1
  end

  handle_starttag(tag, standalone, attrs)
  0
end
# File lib/html/htmlparser.rb, line 69
# Parse as much of the buffered @rawdata as possible, dispatching to the
# handle_* and parse_* methods, and leave whatever could not be parsed yet
# in @rawdata.
#
# _end:: when truthy, any unparsed remainder is passed to handle_data
#        instead of being kept in the buffer.
69: def goahead(_end)
70: rawdata = @rawdata
71: i = 0
72: n = rawdata.length
73: while i < n
# Once @nomoretags is set, everything that is left is plain data.
74: if @nomoretags
75: handle_data(rawdata[i..(n-1)])
76: i = n
77: break
78: end
# Everything up to the next '&' or '<' (or the end of the buffer) is data.
79: j = rawdata.index(Interesting, i)
80: j = n unless j
81: if i < j
82: handle_data(rawdata[i..(j-1)])
83: end
84: i = j
85: break if (i == n)
86: if rawdata[i] == ?< #
87: if rawdata.index(Starttagopen, i) == i
# In literal mode a start tag is not parsed; its '<' is emitted as data.
88: if @literal
89: handle_data(rawdata[i, 1])
90: i += 1
91: next
92: end
# parse_starttag returns the absolute index to continue from, or nil when
# no closing delimiter was found; in that case stop and keep the rest.
93: k = parse_starttag(i)
94: break unless k
95: i = k
96: next
97: end
98: if rawdata.index(Endtagopen, i) == i
# parse_endtag also returns an absolute index (or nil when incomplete).
99: k = parse_endtag(i)
100: break unless k
101: i = k
# A parsed end tag always switches literal mode off.
102: @literal = false
103: next
104: end
105: if rawdata.index(Commentopen, i) == i
106: if @literal
107: handle_data(rawdata[i,1])
108: i += 1
109: next
110: end
# parse_comment returns the number of characters consumed, hence "+=".
111: k = parse_comment(i)
112: break unless k
113: i += k
114: next
115: end
116: if rawdata.index(Special, i) == i
117: if @literal
118: handle_data(rawdata[i, 1])
119: i += 1
120: next
121: end
# parse_special returns the number of characters consumed, hence "+=".
122: k = parse_special(i)
123: break unless k
124: i += k
125: next
126: end
127: elsif rawdata[i] == ?& #
128: if rawdata.index(Charref, i) == i
# The pattern also consumes the one character that follows the digits.
# Step back over it unless it was the ';' terminator, so that it is
# processed again on the next iteration.
129: i += $&.length
130: handle_charref($1)
131: i -= 1 unless rawdata[i-1] == ?;
132: next
133: end
134: if rawdata.index(Entityref, i) == i
# Same trailing-character handling as for character references above.
135: i += $&.length
136: handle_entityref($1)
137: i -= 1 unless rawdata[i-1] == ?;
138: next
139: end
140: else
# Interesting only matches '&' and '<', so this branch signals a logic error.
141: raise RuntimeError, 'neither < nor & ??'
142: end
143: # We get here only if incomplete matches but
144: # nothing else
145: match = rawdata.index(Incomplete, i)
# Not even the start of a construct: emit the single character as data.
146: unless match == i
147: handle_data(rawdata[i, 1])
148: i += 1
149: next
150: end
# The partial construct reaches the end of the buffer: stop and keep it.
# Otherwise it can no longer become a valid construct, so emit it as data.
151: j = match + $&.length
152: break if j == n # Really incomplete
153: handle_data(rawdata[i..(j-1)])
154: i = j
155: end
156: # end while
# With _end set, whatever is still unparsed is flushed as data.
157: if _end and i < n
158: handle_data(@rawdata[i..(n-1)])
159: i = n
160: end
# Keep only the unconsumed tail in the buffer.
161: @rawdata = rawdata[i..-1]
162: end
# File lib/html/htmlparser.rb, line 308
# Handle a numeric character reference such as "&#65;".
#
# name:: the digits between "&#" and the terminator, as a string.
#
# The digits are always read as a decimal number. Values from 0 to 255 are
# converted with Integer#chr and passed to handle_data; anything else (or an
# unparsable name) is passed to unknown_charref.
def handle_charref(name)
  n = begin
    # An explicit base of 10 is required: without it Integer() honours radix
    # prefixes, so "010" would be read as octal 8 and "09" would be rejected.
    Integer(name, 10)
  rescue ArgumentError, TypeError
    -1
  end
  unless n.between?(0, 255)
    unknown_charref(name)
    return
  end
  handle_data(n.chr)
end
# File lib/html/htmlparser.rb, line 297
# Invoke the end-tag handler for +tag+.
#
# tag::    element name (not used here; available to overriding classes).
# method:: name of the handler method, called without arguments.
#
# Returns the handler's return value.
def handle_endtag(tag, method)
  send(method)
end
# File lib/html/htmlparser.rb, line 317
# Handle a named entity reference such as "&amp;".
#
# name:: the entity name without the leading "&" and the terminator.
#
# Names listed in Entitydefs are replaced by their text and passed to
# handle_data; all other names are passed to unknown_entityref.
def handle_entityref(name)
  unless Entitydefs.include?(name)
    unknown_entityref(name)
    return
  end
  handle_data(Entitydefs[name])
end
# File lib/html/htmlparser.rb, line 293
# Invoke the start-tag handler for +tag+.
#
# tag::    element name (not used here; available to overriding classes).
# method:: name of the handler method.
# attrs::  array of [name, value] pairs, passed to the handler as its only
#          argument.
#
# Returns the handler's return value.
def handle_starttag(tag, method, attrs)
  send(method, attrs)
end
# File lib/html/htmlparser.rb, line 164
# Parse the comment that starts at index +i+ of @rawdata.
#
# i:: index of the "<" of "<!--".
#
# The text between "<!--" and the closing "--" is passed to handle_comment.
#
# Returns the number of characters consumed (from +i+ through the closing
# ">"), or nil when the comment is not closed yet.
# Raises RuntimeError when the buffer does not contain "<!--" at +i+.
def parse_comment(i)
  buffer = @rawdata
  unless buffer[i, 4] == '<!--'
    raise RuntimeError, 'unexpected call to handle_comment'
  end

  close_at = buffer.index(Commentclose, i)
  return nil if close_at.nil?

  # Read $& right away, before the handler can run another regexp match.
  close_len = $&.length
  handle_comment(buffer[(i + 4)..(close_at - 1)])
  close_at + close_len - i
end
# File lib/html/htmlparser.rb, line 225
# Parse the end tag that starts at index +i+ of @rawdata.
#
# i:: index of the "<" of "</".
#
# The text between "</" and the closing delimiter is stripped, lower-cased
# and passed to finish_endtag.
#
# Returns the index to continue parsing from: just past the closing ">", or
# the index of the delimiter itself when the tag was ended by "<" or "/>".
# Returns nil when no delimiter has been seen yet.
def parse_endtag(i)
  rawdata = @rawdata
  # Start looking after the "</". Starting at i + 1 lets the "/>" alternative
  # of Endbracket match the slash of "</" itself, so the shorthand "</>" was
  # consumed one character short and its "/>" was re-emitted as text.
  j = rawdata.index(Endbracket, i + 2)
  return nil unless j
  tag = rawdata[(i + 2)..(j - 1)].strip.downcase
  j += 1 if rawdata[j] == ?>
  finish_endtag(tag)
  j
end
# File lib/html/htmlparser.rb, line 284
# Parse the "<!...>" declaration that starts at index +i+ of @rawdata.
#
# i:: index of the "<" of "<!".
#
# The text between "<" and the closing delimiter (so it still starts with
# "!") is passed to handle_special.
#
# Returns the number of characters consumed, or nil when no closing
# delimiter has been seen yet.
def parse_special(i)
  buffer = @rawdata
  close_at = buffer.index(Endbracket, i + 1)
  return nil if close_at.nil?

  # Read $& right away, before the handler can run another regexp match.
  bracket_len = $&.length
  handle_special(buffer[(i + 1)..(close_at - 1)])
  (close_at - i) + bracket_len
end
# File lib/html/htmlparser.rb, line 178
# Parse the start tag that begins at index +i+ of @rawdata.
#
# i:: index of the opening "<".
#
# The tag name and attribute names are lower-cased. Attributes are collected
# as [name, value] pairs; an attribute without "=" gets the value '', and one
# matching pair of surrounding quotes is removed from a value. The result is
# passed to finish_starttag. For a self-closing tag ("/>") finish_endtag is
# called right afterwards.
#
# Returns the index to continue parsing from, or nil when no closing
# delimiter has been seen yet.
# Raises RuntimeError when no tag name can be found after +i+.
def parse_starttag(i)
  rawdata = @rawdata
  j = rawdata.index(Endbracket, i + 1)
  return nil unless j
  attrs = []
  if rawdata[i + 1] == ?>
    # SGML shorthand: <> == <last open tag seen>
    k = j
    tag = @lasttag
  else
    unless rawdata.index(Tagfind, i + 1)
      raise RuntimeError, 'unexpected call to parse_starttag'
    end
    k = i + 1 + $&.length
    tag = $&.downcase
    @lasttag = tag
  end
  while k < j
    # Assaf: fixed to allow tag to close itself (XHTML)
    idx = rawdata.index(Attrfind, k)
    break unless idx && idx < j
    matched_length = $&.length
    attrname, rest, attrvalue = $1, $2, $3
    if !rest
      attrvalue = '' # was: = attrname
    # Assaf: fixed to handle double quoted attribute values properly
    elsif (attrvalue[0] == ?' && attrvalue[-1] == ?') ||
          (attrvalue[0] == ?" && attrvalue[-1] == ?")
      attrvalue = attrvalue[1..-2]
    end
    attrs << [attrname.downcase, attrvalue]
    # Continue after the END of this match. The match may start later than k
    # (index skips characters that cannot begin an attribute, such as "/"),
    # so advancing k by the match length alone would land inside the same
    # match and report the attribute a second time.
    k = idx + matched_length
  end
  # Assaf: fixed to allow tag to close itself (XHTML)
  if rawdata[j, 2] == '/>'
    j += 2
    finish_starttag(tag, attrs)
    finish_endtag(tag)
  else
    j += 1 if rawdata[j] == ?>
    finish_starttag(tag, attrs)
  end
  j
end
# File lib/html/htmlparser.rb, line 301
# Print a diagnostic for an end tag that has no matching open element.
# Nothing is printed unless @verbose is set.
#
# tag:: name of the unbalanced end tag.
def report_unbalanced(tag)
  return unless @verbose
  print '*** Unbalanced </' + tag + '>', "\n"
  # Read the instance variable directly: this method cannot rely on a
  # "stack" reader method being defined on the parser.
  print '*** Stack:', @stack, "\n"
end
# File lib/html/htmlparser.rb, line 39
# Put the parser back into its initial state: empty input buffer, empty
# element stack, placeholder last tag, and both the "no more tags" and
# literal modes switched off.
def reset
  @rawdata = ''
  @stack = []
  @lasttag = '???'
  @nomoretags = @literal = false
end
# File lib/html/htmlparser.rb, line 51
# Switch on both the "no more tags" mode and literal mode. With @nomoretags
# set, goahead passes all remaining input to handle_data unparsed.
def setnomoretags
  @nomoretags = @literal = true
end