This page contains automated test results for code from O'Reilly's Ruby Cookbook. If this code looks interesting or useful, you might want to buy the whole book.
Extracting All the URLs from an HTML Document | ||
---|---|---|
Code | Expected | Actual |
require 'uri'

# Sample markup used by the URL-extraction examples. It mixes anchor tags,
# a bare URL in running text and a mailto: link. The opening double quote
# is part of the sample and is never closed.
text = [
  '"My homepage is at <a href="http://www.example.com/">http://www.example.com/</a>,',
  'and be sure to check out my weblog at http://www.example.com/blog/.',
  'Email me at <a href="mailto:bob@example.com">bob@example.com</a>.'
].join(' ')

# With no scheme list, URI.extract returns every URI-shaped substring it
# finds in the string, whatever the scheme.
URI.extract(text)
["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/.", "mailto:bob@example.com"] | ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/.", "mailto:bob@example.com"] |
# Passing a list of schemes restricts the result to URIs that use one of
# them; here only http and https URLs are wanted.
URI.extract(text, %w[http https])
["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/."] | ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/."] |
require 'rexml/document'
require 'rexml/streamlistener'
require 'set'

# REXML stream listener that gathers the values of URL-bearing attributes
# while a document is parsed. Results accumulate in a Set, so a link that
# appears several times is recorded once.
class LinkGrabber
  include REXML::StreamListener

  # The Set of attribute values collected so far.
  attr_reader :links

  # interesting_tags:: Hash mapping a tag name to the list of attribute
  #                    names whose values should be collected for that tag.
  def initialize(interesting_tags = {'a' => %w{href}, 'img' => %w{src}}.freeze)
    @tags = interesting_tags
    @links = Set.new
  end

  # Stream-parser callback for an opening tag. Tags that are not listed in
  # the interesting-tags hash are ignored; for the others, every listed
  # attribute that is present on the tag contributes its value to +links+.
  def tag_start(name, attrs)
    uri_attrs = @tags[name]
    return unless uri_attrs

    uri_attrs.each do |attr_name|
      target = attrs[attr_name]
      @links << target if target
    end
  end

  # Runs +text+ through REXML's stream parser with this object as listener.
  def parse(text)
    REXML::Document.parse_stream(text, self)
  end
end

grabber = LinkGrabber.new
grabber.parse(text)
grabber.links
#<Set: {"http://www.example.com/", "mailto:bob@example.com"}> | #<Set: {"http://www.example.com/", "mailto:bob@example.com"}> |
# Punctuation that tends to follow a URL in running text. URI.extract treats
# these characters as part of the URL, so they have to be trimmed afterwards.
# Frozen so the constant cannot be modified in place.
END_CHARS = %{.,'?!:;}.freeze

# Extract the http URLs and trim trailing punctuation from each one.
# Chopping repeats until the URL no longer ends in an END_CHARS character,
# so a URL followed by several marks ("..." or "?!") is cleaned completely
# rather than losing only its last character.
URI.extract(text, ['http']).map do |url|
  url = url.chop while url.end_with?(*END_CHARS.chars)
  url
end
["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/"] | ["http://www.example.com/", "http://www.example.com/", "http://www.example.com/blog/"] |
# A LinkGrabber that, once parsing is finished, resolves every collected
# link against a base URL so that relative links come out absolute.
#
# The base URL starts as the +original_url+ passed to the constructor and is
# replaced by the href of any <base> tag encountered in the document.
# REXML::StreamListener and the +links+ reader come from LinkGrabber and are
# therefore not declared again here.
class AbsoluteLinkGrabber < LinkGrabber
  # original_url::     initial base URL to resolve links against, or nil.
  # interesting_tags:: Hash mapping a tag name to the attribute names whose
  #                    values should be collected for that tag.
  def initialize(original_url = nil,
                 interesting_tags = {'a' => %w{href}, 'img' => %w{src}}.freeze)
    super(interesting_tags)
    @base = original_url
  end

  # Stream-parser callback for an opening tag. Records the href of a <base>
  # tag as the new base URL, then lets LinkGrabber collect links as usual.
  def tag_start(name, attrs)
    # A <base> tag that carries no href must not wipe out a base URL that is
    # already known, otherwise no link would be made absolute at all.
    @base = attrs['href'] if name == 'base' && attrs['href']
    super
  end

  # Parses +text+ and then rewrites the collected links in place.
  #
  # When a base URL is known, each link is replaced by the result of
  # URI.join(base, link), i.e. a URI object, and the link Set is returned.
  # When no base URL is known the links are left as collected and nil is
  # returned.
  def parse(text)
    super
    # If we know of a base URL by the end of the document, use it to
    # change all relative URLs to absolute URLs.
    @links.collect! { |l| URI.join(@base, l) } if @base
  end
end

# Tag name => attribute names that may hold a URL; intended to be passed as
# the +interesting_tags+ argument of a link grabber.
#
# NOTE(review): 'codebase' and 'profile' are attribute names in HTML 4
# (<object codebase=...>, <head profile=...>), not element names, and
# <applet> has no classid attribute. These entries look inverted, but are
# kept as they are because existing output depends on them; confirm against
# the HTML 4.01 attribute index before changing.
URL_LOCATIONS = {
  'a'          => %w{href},
  'area'       => %w{href},
  'applet'     => %w{classid},
  'base'       => %w{href},
  'blockquote' => %w{cite},
  'body'       => %w{background},
  'codebase'   => %w{classid},
  'del'        => %w{cite},
  'form'       => %w{action},
  'frame'      => %w{src longdesc},
  'iframe'     => %w{src longdesc},
  'input'      => %w{src usemap},
  'img'        => %w{src longdesc usemap},
  'ins'        => %w{cite},
  'link'       => %w{href},
  'object'     => %w{usemap archive codebase data},
  'profile'    => %w{head},
  'q'          => %w{cite},
  'script'     => %w{src}
}.freeze
{"script"=>["src"], "del"=>["cite"], "a"=>["href"], "profile"=>["head"], "body"=>["background"], "ins"=>["cite"], "img"=>["src", "longdesc", "usemap"], "iframe"=>["src", "longdesc"], "blockquote"=>["cite"], "q"=>["cite"], "frame"=>["src", "longdesc"], "area"=>["href"], "link"=>["href"], "form"=>["action"], "codebase"=>["classid"], "applet"=>["classid"], "object"=>["usemap", "archive", "codebase", "data"], "input"=>["src", "usemap"], "base"=>["href"]} |