#!/usr/bin/env python """Downhill: finds the shortest link path between weblogs. This will load data from the weblog ecosystem at http://www.myelin.co.nz/ecosystem/, and use it to find the shortest path between any two weblogs using a breadth-first search. It keeps the ecosystem data in a dbm file of its own design so as not to have to unpickle Python object every time. The first time you run it, it will try to find the ecosystem pickle file and generate a dbm file from it.""" __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "1.2" __date__ = "$Date: 2003/06/06 01:55:44 $" __copyright__ = "Copyright (c) 2003 Leonard Richardson" __license__ = "Python" import anydbm import copy import os import gdbm import random import string import sys import time import types import urllib from blog import * DEBUG = 0 CGI_FOOTER = '

Brought to you by crummy.com.' CGI_STYLE = 'a.app { color: green }' baseDir = os.path.split(sys.argv[0])[0] DATABASE = os.path.join(baseDir, 'linkData-%s.db') PICKLE_FILE = os.path.join(baseDir, 'linkData.pickle') fields = ['name', 'url', 'fwdLinks', 'backLinks'] #If the database doesn't exist, try to create it from the pickle file. if not os.path.exists(DATABASE % fields[0]): import pickle if not os.path.exists(PICKLE_FILE): raise 'Could find neither database nor pickle file!' blogs = pickle.load(open(PICKLE_FILE)) db = {} for field in fields: db[field] = gdbm.open(DATABASE % field, 'n') for (url, blog) in blogs.items(): if blog.fwdLinks or blog.backLinks: db['name'][url] = blog.name db['url'][url] = blog.url db['fwdLinks'][url] = string.join(blog.fwdLinks.keys(), '|') db['backLinks'][url] = string.join(blog.backLinks.keys(), '|') for field in fields: db[field].close() db = {} for field in fields: db[field] = anydbm.open(DATABASE % field) pathTo = {} fieldCache = {} def getField(url, fieldName): "Retrieves the named field from the database for the given URL." field = fieldCache.get((url, fieldName)) if not field: key = genericLink(url) if db[fieldName].has_key(key): field = db[fieldName][key] else: field = None fieldCache[(url, fieldName)] = field return field def getLinks(url, direction=1): "Returns incoming or outgoing links, depending on the direction parameter." if direction == 1: field = 'fwdLinks' else: field = 'backLinks' links = getField(url, field) if links: return string.split(links, '|') else: return [] def getMatchingBlogs(url, limit=10): """Find some number of blogs whose URLs look like the provided URL.""" c = 8 url = genericLink(url) matches = [] toCheck = db['name'].keys() while c <= len(url): matches = [] prefix = url[:c] for blog in toCheck: if blog[:c] == prefix: matches.append(blog) if not matches: matches = toCheck break elif len(matches) < limit: break else: c = c + 1 if len(matches): toCheck = copy.copy(matches) matches.sort() if len(matches) > limit: matches = matches[:limit] return matches def getHTMLLink(key): "Returns an HTML link of blog url to blog name." url = getField(key, 'url') name = getField(key, 'name') if not name: name = '[Unknown weblog]' if url: link = '%s' % (url, name) else: link = name return link def getShortestPath(srcURL, otherURL, exclude=[]): """Find the shortest path from one URL to another, excluding any URLs in exclude. Does two breadth-first searches, following forward links from the start and backwards links from the destination.""" fwdFrontier = [srcURL] fullFwdFrontier = copy.copy(fwdFrontier) backFrontier = [otherURL] fullBackFrontier = copy.copy(backFrontier) pathTo[srcURL] = [srcURL] pathTo[otherURL] = [otherURL] foundPath = [] if srcURL == otherURL: foundPath = pathTo[srcURL] while not foundPath and fwdFrontier and backFrontier: if len(fwdFrontier) < len(backFrontier): foundPath, fwdFrontier = expandFrontier(fwdFrontier, fullFwdFrontier, fullBackFrontier, 1, exclude) else: foundPath, backFrontier = expandFrontier(backFrontier, fullBackFrontier, fullFwdFrontier, -1, exclude) #To save having to copy lists, we put a blog's parent's path #into its path as a list. Now we need to unroll all the lists. ok = 0 while foundPath and not ok: ok = 1 for i in range(0, len(foundPath)): l = foundPath[i] if type(l) == types.ListType: ok = 0 foundPath[i] = l[0] if len(l) > 1: foundPath.insert(i+1, l[1]) if foundPath and type(foundPath) != types.ListType: foundPath = [foundPath] return foundPath def expandFrontier(frontier, fullFrontier, otherFullFrontier, direction, exclude=[]): """Look at the links for each blog in the frontier. If a link is to a blog found in the frontier for the other search, we've found the shortest way to connect the two searches. Return that. Otherwise, put the link into the new frontier for this search.""" newFrontier = [] foundPath = None for blog in frontier: #Shuffle the links so that we don't get the exact same path each #time. #l = getLinks(blog, direction) l = copy.copy(getLinks(blog, direction)) for i in range(0, len(l)): a = random.randint(i, len(l)-1) tmp = l[a] l[a] = l[i] l[i] = tmp for link in l: if not link in exclude: url = getField(link, 'url') if url: if not url in fullFrontier: fullFrontier.append(url) if not url in newFrontier: newFrontier.append(url) if not pathTo.get(url): path = [pathTo[blog], url] if direction == -1: path.reverse() pathTo[url] = [path] if url in otherFullFrontier: foundPath = pathTo[blog] foundPath.append(pathTo[url]) if direction == -1: foundPath.reverse() break del(l) if foundPath: break if newFrontier == frontier: newFrontier = [] return foundPath, newFrontier class App: def cleanInput(self, input): if string.find(input, 'http') == -1: input = 'http://' + input return input def findBlog(self, input): return getField(genericLink(input), 'url') def getRandomBlog(self): if not hasattr(self, 'allBlogs'): self.allBlogs = db['name'].keys() return random.choice(self.allBlogs) class CommandLineApp(App): def __init__(self): while 1: self.run() def getBlog(self, message="Enter the URL to a weblog, or [q]uit: "): blog = None while not blog: a = string.strip(raw_input(message)) if a == 'q': sys.exit(0) a = self.cleanInput(a) blog = self.findBlog(a) if not blog: print "Sorry, that's not in the dataset." matches = getMatchingBlogs(a, 10) if matches: print "Maybe you meant one of these?" for m in matches: print "", m print return blog def run(self): b1 = self.getBlog() b2 = self.getBlog("Enter the URL to another weblog, or [q]uit: ") if DEBUG: t1 = time.clock() path = getShortestPath(b1, b2) if DEBUG: t2 = time.clock() if path: l = len(path)-1 howMany = str(l) + ' degree' if l != 1: howMany = howMany + 's' print "There are %s of separation between %s and %s." % (howMany, b1, b2) print "%s (%s)" % (getField(path[0], 'name'), getField(path[0], 'url')) for blog in path[1:]: print " links to %s (%s)" % (getField(blog, 'name'), getField(blog, 'url')) else: print "Sorry, I couldn't find any path from %s to %s." % (b1, b2) pathTo.clear() if DEBUG: print "Calculated shortest path in %s seconds." % (t2-t1) print class APICGI(App): MAX_REQUEST = 1000 def __transformInput(self, s): if s: s = self.cleanInput(s) blog = self.findBlog(s) if blog: s = blog else: raise Exception, 'No such blog: %s' else: s = self.getRandomBlog() return s def __getField(self, url, field, split=0): url = self.__transformInput(url) val = getField(url, field) if split: val = string.split(val, '|') return val def getCanonicalURL(self, url): return self.__getField(url, 'url') def getName(self, url): return self.__getField(url, 'name') def getIncomingLinks(self, url): return self.__getField(url, 'fwdLinks', 1) def getOutgoingLinks(self, url): return self.__getField(url, 'backLinks', 1) def getMatchingURLs(self, prefix): if not prefix: raise Exception, 'No prefix provided: %s' % prefix prefix = self.cleanInput(prefix) return getMatchingBlogs(prefix) def shortestPath(self, url1, url2, exclude=[]): blog1 = self.__transformInput(url1) blog2 = self.__transformInput(url2) for i in range(0, len(exclude)): exclude[i] = self.__transformInput(exclude[i]) path = getShortestPath(blog1, blog2, exclude) if path == None: path = [] return path def giveup(self, message): print "Status: 400" print print "sorry,", message sys.exit(0) def __init__(self): import xmlrpclib self.handlers = { 'downhill' : self } if os.environ.get("REQUEST_METHOD") != "POST": self.giveup("invalid request") bytes = int(os.environ.get("CONTENT_LENGTH", 0)) if bytes > self.MAX_REQUEST: self.giveup("request too large") print 'Content-type: text/xml\n' params, method = xmlrpclib.loads(sys.stdin.read(bytes)) result = self.dispatch(method, params) if not isinstance(result, type(())): result = (result,) response = xmlrpclib.dumps(result, methodresponse=1) print 'Content-Length: %s' % len(response) print response def dispatch(self, method, params): packages = string.split(method, '.') result = 0 if len(packages) < 1: self.giveup("No package or method specified") elif len(packages) < 2: self.giveup("No package specified for method!") m = self.handlers.get(packages[0]) if not m: self.giveup("Unsupported package specified: %s" % packages[0]) for package in packages[1:]: m = getattr(m, package, None) if not m: break if m: try: result = apply(m, params) except Exception, fault: import xmlrpclib result = xmlrpclib.Fault(fault) return result class PathCGI(App): EXCLUDE = 'exclude' BLOG1 = 'blog1' BLOG2 = 'blog2' BLOG1OVER = 'blog1Override' BLOG2OVER = 'blog2Override' FIND = 'find' SET_OVERRIDE1 = 'setOverride1' SET_OVERRIDE2 = 'setOverride2' def __init__(self, pathInfo=None, queryString=None, variables={}): #Generic setup. if not queryString: queryString = os.environ.get('QUERY_STRING') if not pathInfo: pathInfo = os.environ.get('PATH_INFO') if pathInfo: pathInfo = pathInfo[1:] if pathInfo and len(pathInfo) >= 6 and string.lower(pathInfo[:6]) == 'xmlrpc': APICGI() return import cgi form = cgi.FieldStorage() self.variables = variables if not self.variables: for key in form.keys(): if type(form[key]) == types.ListType: value = [] for v in form[key]: value.append(v.value) else: value = form[key].value self.variables[key] = urllib.unquote(value) prot = string.lower(os.environ.get('SERVER_PROTOCOL', 'http')) prot = prot[:string.find(prot, '/')] self.thisURL = prot + '://' + os.environ.get('SERVER_NAME', '') + os.environ.get('SCRIPT_NAME', '') + '/' #End generic setup. self.exclude = self.variables.get(self.EXCLUDE, []) if type(self.exclude) == type(''): self.exclude = [self.exclude] self.usedDefaults = 0 for i in (1,2): self.getBlog(i) self.startPage() print '


' self.printForm() if self.blog1 and self.blog2 and not (self.blog1Matches or self.blog2Matches) and (not self.usedDefaults or self.variables.get(self.FIND)): print '
' path = self.printShortestPath() self.endPage() def getBlog(self, num): storeVar = 'blog%s' % num matchesVar = 'blog%sMatches' % num overrideCommand = getattr(self, 'SET_OVERRIDE%s' % num) overrideField = getattr(self, 'BLOG%sOVER' % num) blogField = getattr(self, 'BLOG%s' % num) setattr(self, storeVar, None) setattr(self, matchesVar, None) name = None if self.variables.get(overrideCommand): #They entered an ambiguous URL and then selected an entry. name = self.variables.get(overrideField) if not name: #They typed in a URL. name = self.variables.get(blogField) if not name: #They specified nothing; use a random default name = self.getRandomBlog() self.usedDefaults = 1 if name: name = self.cleanInput(name) blog = self.findBlog(name) if not blog: setattr(self, matchesVar, getMatchingBlogs(name)) setattr(self, storeVar, name) def startPage(self): print "Content-type: text/html\n" print 'Downhill v%s' % __version__ print '' % CGI_STYLE self.printIntro() def printIntro(self): print '

Downhill

' print '

This is like the Oracle of Bacon, but for weblogs. Based on the links between weblogs, it will try to find a path between any two you specify. The data comes from the blogging ecosystem, which is long out of date but the only publicly available dataset of its kind that I know of.

' def getExclusionArgs(self, notExcluded=[]): args = '' for i in self.exclude: if not i in notExcluded: args = args + '&' + self.EXCLUDE + '=' + urllib.quote(i) return args def getURL(self, blog1=None, blog2=None, notExcluded=[]): if not blog1: blog1 = self.blog1 if not blog2: blog2 = self.blog2 url = '%s?%s=%s&%s=%s%s' % (self.thisURL, self.BLOG1, urllib.quote(blog1), self.BLOG2, urllib.quote(blog2), self.getExclusionArgs(notExcluded)) return url def printShortestPath(self): if DEBUG: t1 = time.clock() path = getShortestPath(self.blog1, self.blog2, self.exclude) if DEBUG: t2 = time.clock() if path: l = len(path)-1 if l == 1: howMany = 'is %s degree' else: howMany = 'are %s degrees' howMany = howMany % l print "

There %s of separation between %s and %s.

" % (howMany, getHTMLLink(self.blog1), getHTMLLink(self.blog2)) print '
%s' % (getField(path[0], 'url'), getField(path[0], 'name')) for blog in path[1:]: print '
' if blog != path[-1]: print 'x   ' % (self.getURL(), self.EXCLUDE, urllib.quote(blog)) print 'links to %s' % getHTMLLink(blog) for blog in path[1:]: print '
' print '
' else: print "

Sorry, I couldn't find a path between %s and %s.

" % (getHTMLLink(self.blog1), getHTMLLink(self.blog2)) if DEBUG: print '(That took %s seconds)' % (t2-t1) if not path or len(path) != 1: print '

Try to find a reverse path

' % self.getURL(self.blog2, self.blog1) if self.exclude: print '

You chose to exclude the following URLs from consideration when calculating the shortest path:

' print '' if len(self.exclude) > 1: print 'Include all' % self.getURL(self.blog1, self.blog2, self.exclude) return path def endPage(self): print CGI_FOOTER print '' def printBlogField(self, desc, blogNum): otherNum = 2 if blogNum == 2: otherNum = 1 value = getattr(self, 'blog%s' % blogNum) otherValue = getattr(self, 'blog%s' % otherNum) var = getattr(self, 'BLOG%s' % blogNum) otherVar = getattr(self, 'BLOG%s' % otherNum) print '

%s:' % desc print '' % (var, value) print 'Random' % (self.thisURL, otherVar, urllib.quote(otherValue), self.getExclusionArgs(), self.FIND) print '

' replacements = getattr(self, 'blog%sMatches' % blogNum) if replacements: print "
(Sorry, %s isn't in my database; maybe you meant one of the following?)

" % value first = 0 for replacement in replacements: selected = '' if not first: selected = ' checked="checked"' print ' %s
' % (getattr(self, 'BLOG%sOVER' % blogNum), replacement, selected, replacement) first = 1 print '' % getattr(self, 'SET_OVERRIDE%s' % blogNum) print '
' def printForm(self): print '
' % self.thisURL for i in self.exclude: print '' % i self.printBlogField("Start at this URL", 1) self.printBlogField("End at this URL", 2) print '' print '
' if __name__ == '__main__': if os.environ.get('SERVER_PROTOCOL'): PathCGI() else: CommandLineApp()