This page contains automated test results for code from O'Reilly's Ruby Cookbook. If this code looks interesting or useful, you might want to buy the whole book.
| Extracting All the URLs from an HTML Document | ||
|---|---|---|
| Code | Expected | Actual |
require 'uri'
# Sample text containing both marked-up (<a href=...>) links and bare
# URLs sitting in the running prose.
text = %{"My homepage is at
<a href="http://www.example.com/">http://www.example.com/</a>, and be sure
to check out my weblog at http://www.example.com/blog/. Email me at <a
href="mailto:bob@example.com">bob@example.com</a>.}
# Pull every URI-like substring out of the text. Note that trailing
# punctuation is kept: the "." after blog/ ends up inside the URL.
URI.extract(text) |
["http://www.example.com/", "http://www.example.com/", | ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/.", "mailto:bob@example.com"] |
URI.extract(text, ['http', 'https']) |
["http://www.example.com/", "http://www.example.com/" | ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/."] |
require 'rexml/document'
require 'rexml/streamlistener'
require 'set'
# Harvests URIs from the attributes of interesting tags as an XML/HTML
# document is streamed through REXML.
class LinkGrabber
  include REXML::StreamListener

  # Set of URI strings collected so far.
  attr_reader :links

  # interesting_tags maps a tag name to the list of its attributes that
  # may hold a URI (default: <a href> and <img src>).
  def initialize(interesting_tags = {'a' => %w{href}, 'img' => %w{src}}.freeze)
    @tags = interesting_tags
    @links = Set.new
  end

  # StreamListener callback: record any URI-bearing attributes of this tag.
  def tag_start(name, attrs)
    wanted = @tags[name]
    return unless wanted
    wanted.each do |attr_name|
      value = attrs[attr_name]
      @links << value if value
    end
  end

  # Run the stream parser over +text+, filling in #links as a side effect.
  def parse(text)
    REXML::Document.parse_stream(text, self)
  end
end
# Harvest links from the sample text's markup: only the two <a href>
# values appear, since bare URLs in the prose are not tag attributes.
grabber = LinkGrabber.new
grabber.parse(text)
grabber.links |
#<Set: {"http://www.example.com/", "mailto:bob@example.com"}> | #<Set: {"http://www.example.com/", "mailto:bob@example.com"}> |
# Characters that commonly trail a URL in prose but are not part of it.
END_CHARS = %{.,'?!:;}
# Strip a single trailing punctuation character from each extracted URL.
URI.extract(text, ['http']).collect { |u| END_CHARS.index(u[-1]) ? u.chop : u } |
["http://www.example.com/", "http://www.example.com/", | ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/"] |
# A LinkGrabber that resolves relative URLs against a base URL: either
# one supplied by the caller or one found in the document's <base> tag.
# (The redundant re-include of REXML::StreamListener and re-declaration
# of attr_reader :links have been removed; both are inherited.)
class AbsoluteLinkGrabber < LinkGrabber
  # original_url:: base URL used to absolutize relative links; a
  #                <base href="..."> seen while parsing takes precedence.
  def initialize(original_url = nil,
                 interesting_tags = {'a' => %w{href}, 'img' => %w{src}}.freeze)
    super(interesting_tags)
    @base = original_url
  end

  # Track the document's <base> URL in addition to the inherited link
  # harvesting. Guard on attrs['href'] so a malformed <base> tag with no
  # href does not clobber a caller-supplied base with nil.
  def tag_start(name, attrs)
    @base = attrs['href'] if name == 'base' && attrs['href']
    super
  end

  def parse(text)
    super
    # If we know of a base URL by the end of the document, use it to
    # change all relative URLs to absolute URLs.
    @links.collect! { |l| URI.join(@base, l) } if @base
  end
end
# Every HTML tag/attribute pair whose value holds a URL, for exhaustive
# link extraction beyond just <a href> and <img src>.
# NOTE(review): 'codebase' is an attribute of <applet>/<object>, not a
# tag of its own — this entry is reproduced as printed in the original.
URL_LOCATIONS = { 'a' => %w{href},
'area' => %w{href},
'applet' => %w{classid},
'base' => %w{href},
'blockquote' => %w{cite},
'body' => %w{background},
'codebase' => %w{classid},
'del' => %w{cite},
'form' => %w{action},
'frame' => %w{src longdesc},
'iframe' => %w{src longdesc},
'input' => %w{src usemap},
'img' => %w{src longdesc usemap},
'ins' => %w{cite},
'link' => %w{href},
'object' => %w{usemap archive codebase data},
'profile' => %w{head},
'q' => %w{cite},
'script' => %w{src}}.freeze |
{"script"=>["src"], "del"=>["cite"], "a"=>["href"], "profile"=>["head"], "body"=>["background"], "ins"=>["cite"], "img"=>["src", "longdesc", "usemap"], "iframe"=>["src", "longdesc"], "blockquote"=>["cite"], "q"=>["cite"], "frame"=>["src", "longdesc"], "area"=>["href"], "link"=>["href"], "form"=>["action"], "codebase"=>["classid"], "applet"=>["classid"], "object"=>["usemap", "archive", "codebase", "data"], "input"=>["src", "usemap"], "base"=>["href"]} | |