This page contains automated test results for code from O'Reilly's Ruby Cookbook. If this code looks interesting or useful, you might want to buy the whole book.
| Parsing Invalid Markup | | |
|---|---|---|
| Code | Expected | Actual |
require 'rubygems'
require 'rubyful_soup'
invalid_html = 'A lot of <b class=1>tags are <i class=2>never closed.'
soup = BeautifulSoup.new(invalid_html)
puts soup.prettify |
A lot of <b class="1">tags are <i class="2">never closed. </i> </b> |
A lot of <b class="1">tags are <i class="2">never closed. </i> </b> |
soup.b.i |
<i class="2">never closed.</i> | <i class="2">never closed.</i> |
soup.i |
<i class="2">never closed.</i> | <i class="2">never closed.</i> |
soup.find(nil, :attrs=>{'class' => '2'}) |
<i class="2">never closed.</i> | <i class="2">never closed.</i> |
soup.find_all('i') |
[<i class="2">never closed.</i>] | [<i class="2">never closed.</i>] |
soup.b['class'] |
"1" | "1" |
soup.find_text(/closed/) |
"never closed." | "never closed." |
require 'rubygems'
require 'html/sgml-parser'
require 'set'

html = %{<a name="anchor"><a href="http://www.oreilly.com">O'Reilly</a>
<b>irrelevant</b><a href="http://www.ruby-lang.org/">Ruby</a>}

class LinkGrabber < HTML::SGMLParser
  attr_reader :urls

  def initialize
    @urls = Set.new
    super
  end

  # Called once for every <a> tag; attrs arrives as [name, value] pairs.
  def do_a(attrs)
    url = attrs.find { |attr| attr[0] == 'href' }
    @urls << url[1] if url
  end
end

extractor = LinkGrabber.new
extractor.feed(html)
extractor.urls |
#<Set: {"http://www.ruby-lang.org/", "http://www.oreilly.com"}> | #<Set: {"http://www.ruby-lang.org/", "http://www.oreilly.com"}> |
require 'rubyful_soup'
urls = Set.new
BeautifulStoneSoup.new(html).find_all('a').each do |tag|
  urls << tag['href'] if tag['href']
end
puts BeautifulStoneSoup.new(html, :parse_only_these => 'a') |
<a name="anchor"></a> <a href="http://www.oreilly.com">O'Reilly</a> <a href="http://www.ruby-lang.org/">Ruby</a> |
<a name="anchor"></a><a href="http://www.oreilly.com">O'Reilly</a><a href="http://www.ruby-lang.org/">Ruby</a> |
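Taken together, the examples above suggest that the Set-collecting loop also copes with the kind of invalid markup tested at the top of this page, since rubyful_soup closes dangling tags on its own. The sketch below is an illustration rather than part of the automated test run: it uses only calls exercised above (`BeautifulSoup.new`, `find_all('a')`, `tag['href']`), and the `broken_html` input and the noted result are assumptions.

    require 'rubygems'
    require 'rubyful_soup'
    require 'set'

    # Illustrative input (not from the test run): two anchors that are never closed.
    broken_html = %{<a href="http://www.oreilly.com">O'Reilly <a href="http://www.ruby-lang.org/">Ruby}

    # The lenient parser still finds both <a> tags, so the hrefs can be
    # collected exactly as in the SGMLParser example.
    links = Set.new
    BeautifulSoup.new(broken_html).find_all('a').each do |tag|
      links << tag['href'] if tag['href']
    end
    links  # expected to contain both URLs

As in the SGMLParser version, collecting into a Set keeps only distinct URLs, so repeated links in the source markup appear once in the result.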