 
 This page contains automated test results for code from O'Reilly's Ruby Cookbook. If this code looks interesting or useful, you might want to buy the whole book.
| Parsing Invalid Markup | ||
|---|---|---|
| Code | Expected | Actual | 
| require 'rubygems' require 'rubyful_soup' invalid_html = 'A lot of <b class=1>tags are <i class=2>never closed.' soup = BeautifulSoup.new(invalid_html) puts soup.prettify | A lot of <b class="1">tags are <i class="2">never closed. </i> </b> | A lot of <b class="1">tags are <i class="2">never closed. </i> </b> | 
| soup.b.i | <i class="2">never closed.</i> | <i class="2">never closed.</i> | 
| soup.i | <i class="2">never closed.</i> | <i class="2">never closed.</i> | 
| soup.find(nil, :attrs=>{'class' => '2'}) | <i class="2">never closed.</i> | <i class="2">never closed.</i> | 
| soup.find_all('i') | [<i class="2">never closed.</i>] | [<i class="2">never closed.</i>] | 
| soup.b['class'] | "1" | "1" | 
| soup.find_text(/closed/) | "never closed." | "never closed." | 
| require 'rubygems'
require 'html/sgml-parser'
require 'set'
html = %{<a name="anchor"><a href="http://www.oreilly.com">O'Reilly</a>
         <b>irrelevant</b><a href="http://www.ruby-lang.org/">Ruby</a>}
# Parser subclass that collects the 'href' attribute value of the <a> tags
# it is fed. Values are kept in a Set, so a repeated link is stored once.
class LinkGrabber < HTML::SGMLParser
  # The Set of href values gathered so far.
  attr_reader :urls
  # Creates the empty URL set, then defers to the parent initializer.
  def initialize
    @urls = Set.new
    super
  end
  # Handler for <a> tags.
  # NOTE(review): presumably invoked by HTML::SGMLParser for each <a> tag
  # with attrs as a list of [name, value] pairs -- confirm against the library.
  # Records element 1 of the first entry whose element 0 is 'href'.
  def do_a(attrs)
    href_pair = attrs.find { |pair| pair[0] == 'href' }
    return unless href_pair
    @urls.add(href_pair[1])
  end
end
extractor = LinkGrabber.new
extractor.feed(html)
extractor.urls | #<Set: {"http://www.ruby-lang.org/", "http://www.oreilly.com"}> | #<Set: {"http://www.ruby-lang.org/", "http://www.oreilly.com"}> | 
| require 'rubyful_soup'
urls = Set.new
BeautifulStoneSoup.new(html).find_all('a').each do |tag|
  urls << tag['href'] if tag['href']
end
puts BeautifulStoneSoup.new(html, :parse_only_these => 'a') | <a name="anchor"></a> <a href="http://www.oreilly.com">O'Reilly</a> <a href="http://www.ruby-lang.org/">Ruby</a> | <a name="anchor"></a><a href="http://www.oreilly.com">O'Reilly</a><a href="http://www.ruby-lang.org/">Ruby</a> |