#!/usr/bin/python """Ass-Kicking Laser Algorithm by Leonard Richardson (leonardr@segfault.org) This misleadingly named library provides a Python interface to weblogs.com's XML file of recently updated weblogs. There's lots of Python code to ping weblogs.com, but I couldn't find any to parse that XML file. So I wrote this. It defines one method: getWeblogs(limit=-1, cutoffTime=None, convertTimes=0) This method will give you the whole weblogs.com data structure or a recent subset of it. You can set a limit on the number of recent entries you want, or you can provide the last-update time you got from getWeblogs last time you polled, so that it knows where in the list to stop. You can also have it convert the weblogs.com time format into a Python tuple, and use that tuple to determine the time (expressed in seconds since epoch) each weblog in the list did its ping. This module has no external dependencies. For a demo, see the bottom of this file, or run it as a script. Version history: 1.1 Changed convertTime into convertTimes, which not only parses the date/time in weblogUpdates, but uses that time to turn each weblog's relative update time (expressed as a number of seconds before the weblogUpdates time) into an absolute time. Now that we have absolute times, it makes a lot more sense to specify a cutoff time than a list of cutoff URLs, so I got rid of one in favor of the other. 1.0 Initial release """ __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "1.1" __copyright__ = "Copyright (c) 2004 Leonard Richardson" __license__ = "Python" import urllib import sgmllib import time URL = "http://www.weblogs.com/changes.xml" TIME_FORMAT = '%a, %d %b %Y %H:%M:%S %Z' class opener(urllib.FancyURLopener): #Taken from Frederick Lundh's Technorati API. def http_error_default(self, url, fp, errcode, errmsg, headers, data=None): raise IOError, "HTTP error %s fetching http:%s" % (errcode, url) class LimitReachedException(Exception): pass class WeblogsParser(sgmllib.SGMLParser): def __init__(self, limit=0, cutoffTime=None, convertTimes=0): sgmllib.SGMLParser.__init__(self) self.convertTimes = convertTimes self.limit = limit self.counter = 0 if type(cutoffTime) == type(()): cutoffTime = time.localtime(cutofftime) self.cutoffTime = cutoffTime self.metadata = {} self.weblogs = [] def start_weblogupdates(self, attrs): for key, val in attrs: import time if key == 'updated' and (self.cutoffTime or self.convertTimes): parsedTime = time.strptime(val, TIME_FORMAT) self.baseTime = time.mktime(parsedTime) if self.convertTimes: val = parsedTime self.metadata[key] = val def do_weblog(self, attrs): data = {} for (key, val) in attrs: data[key] = val if data.get('when') and (self.convertTimes or self.cutoffTime): #Turn the relative update time into an absolute time. absoluteTime = self.baseTime-int(data['when']) if self.cutoffTime and absoluteTime <= self.cutoffTime: raise LimitReachedException, self.cutoffTime if self.convertTimes: data['when'] = time.localtime(absoluteTime) self.weblogs.append(data) self.counter = self.counter + 1 if self.limit == self.counter: raise LimitReachedException, self.limit def getWeblogs(limit=-1, cutoffTime=None, convertTimes=0): """Returns a list of updated weblogs. To get fewer than the entire set of results, you can pass in a numeric limit, or a list of weblogs you've already seen in the list. You shouldn't need to pass in any more than the 2 or 3 most recent weblogs you saw on the list. Pass in 1 to convertTimes to parse the file update time and use it to make all the weblog update times into absolute update times.""" text = opener().open(URL).read() parser = WeblogsParser(limit, cutoffTime, convertTimes) try: parser.feed(text) parser.close() except LimitReachedException: #This is normal, except for the fact that I'm using exceptions #to do control flow. pass return parser.metadata, parser.weblogs if __name__ == '__main__': #Run through a quick self-test/demo. a = time.time() metadata, first = getWeblogs(convertTimes=1) b = time.time() print "Elapsed:", (b-a) print 'Current weblogs.com metadata:', metadata print len(first), 'weblogs currently in the list' if first: print "Most recently updated weblog: ", first[0] print getWeblogs(cutoffTime=cutoff) testIndex = 50 if len(first) > testIndex: print "Testing numeric cutoff." a = time.time() print len(getWeblogs(testIndex)[1]), '== 50' b = time.time() print "Elapsed:", (b-a) #This should always happen for any reasonable value of testIndex, #but just to be safe. print "Trying time-based cutoff." print "This test should only get the most recent %s entries." % (testIndex-1) cutoff = first[testIndex-1]['when'] updated = getWeblogs(cutoffTime=cutoff) print len(updated[1]), '==', testIndex-1 else: print "Couldn't run fancy cutoff test because there weren't %s entries in the weblogs.com data, which is a little weird--probably something's wrong with the site, or it's the post-apocalyptic future and there are fewer weblogs." % testIndex