#!/usr/bin/python
"""Scrape news headline links and cache them on disk as pickles.

Each scraper downloads its pages, collects the anchors whose href matches
the scraper's LINK_DESIGNATOR, and stores a {headline text: absolute URL}
dict in the file CACHE/<KEY>. Running this file as a script refreshes the
cache file of every scraper registered in scraperClasses.
"""
import os
import pickle
import urllib
import urlparse
from BeautifulSoup import BeautifulSoup
import socket

# Default timeout, in seconds, for sockets created from here on.
socket.setdefaulttimeout(30)

BASE = '/home/leonardr/public_html/features/DogBitesDog/'
CACHE = os.path.join(BASE, 'cache/')


class NewsScraper:
    """Base class: gather headline links from a list of pages.

    Subclasses are expected to define:
      KEY             -- name of the cache file written by pickleMe()
      NAME            -- human-readable title (not used in this file)
      URLS            -- default list of pages to scrape (optional)
      LINK_DESIGNATOR -- value handed to BeautifulSoup's fetch() as the
                         required 'href' attribute of a headline anchor
                         (the trailing '%' is presumably a wildcard in this
                         BeautifulSoup version -- TODO confirm)
    """

    HOMEPAGE = 'http://www.crummy.com/features/DogBitesDog/'
    DESCRIPTION = 'News headlines chopped up and shuffled.'

    def __init__(self, urls=None):
        """Remember the pages to scrape and the site root of each one.

        urls -- list of page URLs; when empty or None the class attribute
                URLS is used if present, otherwise no pages are scraped.
        """
        if not urls:
            urls = getattr(self, 'URLS', [])
        self.urls = urls
        # Map each page URL to its 'scheme://host' prefix; extractLinks()
        # resolves relative hrefs against that prefix.
        self.baseURLs = {}
        for url in urls:
            parts = urlparse.urlparse(url)
            self.baseURLs[url] = parts[0] + '://' + parts[1]
        # Filled lazily by getLinks().
        self.links = None

    def getLinks(self):
        """Return the {headline: URL} dict, fetching it on first use."""
        if self.links is None:
            self.links = self.fetchLinks()
        return self.links

    def fetchLinks(self):
        """Scrape every page in self.urls and return the merged links."""
        links = {}
        for url in self.urls:
            self.extractLinks(url, links)
        return links

    def extractLinks(self, url, bucket):
        """Add the headline links found on one page to bucket.

        url    -- page to download; must be a key of self.baseURLs
        bucket -- dict updated in place, headline text -> absolute URL
        """
        response = urllib.urlopen(url)
        try:
            page = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        soup = BeautifulSoup()
        soup.feed(page)
        for anchor in soup.fetch('a', {'href': self.LINK_DESIGNATOR}):
            if not anchor.contents:
                # An anchor with nothing inside has no headline to key on;
                # skip it instead of aborting the run with an IndexError.
                continue
            newsURL = urlparse.urljoin(self.baseURLs[url], anchor['href'])
            description = str(anchor.contents[0])
            bucket[description] = newsURL

    def pickleMe(self):
        """Write this scraper's links to the file CACHE/<KEY>."""
        # Collect the links before opening the file, so a failed scrape
        # does not truncate the existing cache file.
        links = self.getLinks()
        # Mode "w" (not "wb") is kept on purpose: the default pickle
        # protocol is text-based, and whatever reads this cache is not
        # visible here, so the mode must stay consistent with it.
        f = open(os.path.join(CACHE, self.KEY), "w")
        try:
            pickle.dump(links, f)
        finally:
            f.close()


class OffbeatScraper(NewsScraper):
    """Scraper for the azcentral.com offbeat news page."""

    KEY = 'Offbeat'
    NAME = 'Recombinant offbeat news'
    URLS = ["http://www.azcentral.com/offbeat/"]
    LINK_DESIGNATOR = '/offbeat/articles/%'


class NewsHubScraper(NewsScraper):
    """Scraper for three newshub.com category pages."""

    KEY = 'WorldNationalScience'
    NAME = 'Recombinant news'
    URLS = ["http://www.newshub.com/hub.php?cat=18",
            "http://www.newshub.com/hub.php?cat=16",
            "http://www.newshub.com/hub.php?cat=13"]
    LINK_DESIGNATOR = 'rd.php%'


class Everything(NewsScraper):
    """Pseudo-scraper that merges the links of all the other scrapers."""

    KEY = 'Everything'
    NAME = 'All the Dog Bites Dog news sources jumbled together.'

    def getLinks(self):
        """Return the union of the links of every other scraper.

        Reads the module-level list `scrapers`, which this file only
        creates when run as a script; calling this method without that
        global being set raises NameError. The result is rebuilt on every
        call; on duplicate headlines the scraper listed later wins.
        """
        self.links = {}
        for scraper in scrapers:
            if scraper.KEY != self.KEY:
                self.links.update(scraper.getLinks())
        return self.links


# Registry of the available scraper classes, keyed by cache-file name.
scraperClasses = {}
for scraperClass in NewsHubScraper, OffbeatScraper, Everything:
    scraperClasses[scraperClass.KEY] = scraperClass

if __name__ == '__main__':
    # Build every scraper before pickling any of them: Everything reads
    # the `scrapers` list, and the order of scraperClasses.values() is
    # arbitrary, so pickling while still building could leave the
    # aggregate without the scrapers created after it. Each scraper
    # caches its links, so nothing is downloaded twice.
    scrapers = []
    for scraperClass in scraperClasses.values():
        scrapers.append(scraperClass())
    for scraper in scrapers:
        scraper.pickleMe()