This page contains automated test results for code from O'Reilly's Ruby Cookbook. If this code looks interesting or useful, you might want to buy the whole book.

Extracting All the URLs from an HTML Document
Code | Expected | Actual
require 'uri'
text = %{"My homepage is at 
<a href="http://www.example.com/">http://www.example.com/</a>, and be sure
to check out my weblog at http://www.example.com/blog/. Email me at <a
href="mailto:bob@example.com">bob@example.com</a>.}
URI.extract(text)
["http://www.example.com/", "http://www.example.com/", ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/.", "mailto:bob@example.com"]
URI.extract(text, ['http', 'https'])
["http://www.example.com/", "http://www.example.com/" ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/."]
require 'rexml/document'
require 'rexml/streamlistener'
require 'set'
class LinkGrabber
  include REXML::StreamListener
  attr_reader :links
  def initialize(interesting_tags = {'a' => %w{href}, 'img' => %w{src}}.freeze)
    @tags = interesting_tags
    @links = Set.new
  end
  def tag_start(name, attrs)
    @tags[name].each do |uri_attr|
      @links << attrs[uri_attr] if attrs[uri_attr]
    end if @tags[name]
  end
  def parse(text)
    REXML::Document.parse_stream(text, self)    
  end
end
grabber = LinkGrabber.new
grabber.parse(text)
grabber.links
Expected: #<Set: {"http://www.example.com/", "mailto:bob@example.com"}>
Actual:   #<Set: {"http://www.example.com/", "mailto:bob@example.com"}>
END_CHARS = %{.,'?!:;}
URI.extract(text, ['http']).collect { |u| END_CHARS.index(u[-1]) ? u.chop : u }
["http://www.example.com/", "http://www.example.com/", ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/"]
class AbsoluteLinkGrabber < LinkGrabber
  include REXML::StreamListener
  attr_reader :links
  def initialize(original_url = nil,
                 interesting_tags = {'a' => %w{href}, 'img' => %w{src}}.freeze)
    super(interesting_tags)
    @base = original_url
  end
  def tag_start(name, attrs)    
    if name == 'base'
      @base = attrs['href']
    end
    super
  end
  def parse(text)
    super
    # If we know of a base URL by the end of the document, use it to
    # change all relative URLs to absolute URLs.
    @links.collect! { |l| URI.join(@base, l) } if @base
  end
end
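The test run above defines AbsoluteLinkGrabber but never exercises it. A minimal usage sketch, assuming the document came from http://www.example.com/ (this call is not part of the recorded results):
absolute_grabber = AbsoluteLinkGrabber.new('http://www.example.com/')
absolute_grabber.parse(text)
# Any relative hrefs in the document would now show up as absolute URIs,
# since each collected link has been joined to the base URL.
absolute_grabber.links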
URL_LOCATIONS = { 'a' => %w{href},
  'area' => %w{href},
  'applet' => %w{classid},
  'base' => %w{href},
  'blockquote' => %w{cite},
  'body' => %w{background},   
  'codebase' => %w{classid},
  'del' => %w{cite},
  'form' => %w{action},
  'frame' => %w{src longdesc},
  'iframe' => %w{src longdesc},
  'input' => %w{src usemap},
  'img' => %w{src longdesc usemap},
  'ins' => %w{cite},
  'link' => %w{href},
  'object' => %w{usemap archive codebase data},
  'profile' => %w{head},
  'q' => %w{cite},
  'script' => %w{src}}.freeze
{"script"=>["src"], "del"=>["cite"], "a"=>["href"], "profile"=>["head"], "body"=>["background"], "ins"=>["cite"], "img"=>["src", "longdesc", "usemap"], "iframe"=>["src", "longdesc"], "blockquote"=>["cite"], "q"=>["cite"], "frame"=>["src", "longdesc"], "area"=>["href"], "link"=>["href"], "form"=>["action"], "codebase"=>["classid"], "applet"=>["classid"], "object"=>["usemap", "archive", "codebase", "data"], "input"=>["src", "usemap"], "base"=>["href"]}