#!/usr/bin/python
"""Splice scraped news headlines together.

Each headline is split into a subject and a predicate at the first
plausible noun/verb boundary; the subject of one headline is then joined
to the predicate of another.  The results are written out as an HTML
fragment and an RSS feed for every scraper in ``scraperClasses``.
"""
import copy
import os
import pickle
import random
import re
from xml.sax.saxutils import escape

from NewsScraper import scraperClasses, BASE, CACHE

MORBID = ["die", "dies", "died", "dead", "death", "deaths", "kill",
          "kills", "killed", "killer", "killers", "killing", "killings",
          "suffers", "suffer", "hurt", "murder", "murdered", "murder",
          "murderers", "wound", "wounds", "wounded", "rape", "rapist",
          "rapists", "rapes"]

#Hard cases: "Kremlin Man Wins in Chechnya"
#               n      nv   nv
#Correct division is between "man" and "wins", but "Kremlin" is an
#unambiguous noun, so the "Kremlin"/"Man" division would be chosen instead
#whenever "man" is tagged as a verb. WordList.NOT_A_HEADLINE_VERB keeps
#"man" out of the verb table for that reason.


def _escapeHtml(text):
    """Escape text for use in HTML element content or a quoted attribute."""
    return escape(text, {'"': '&quot;'})


class WordList:
    """Lazily loaded noun/verb lookup tables, one data file per first letter."""

    PLURAL_SUFFIXES = ['s', 'es', 'ies']
    TENSE_SUFFIXES = ['ed', 'd', 'ing']
    SINGULAR_VERBS = ['is']
    NOT_A_HEADLINE_VERB = ['man'] #It's sooooo unlikely.

    # Strips a leading or trailing apostrophe and every character that is
    # not a letter, digit, hyphen or apostrophe.
    CHOP = re.compile("^'|[^A-Za-z0-9-']|'$", re.I)

    def __init__(self):
        # Both dicts map a first letter to a {word: 1} table.
        self.verbs = {}
        self.nouns = {}

    def tag(self, words):
        """Tag every word of a headline.

        Returns three parallel lists (noun, verb, verbPlural) holding the
        isNoun() result and the two isVerb() results for each word.
        """
        noun, verb, verbPlural = [], [], []
        for index, word in enumerate(words):
            if index == 0:
                #don't treat the first word as a proper noun
                word = word.lower()
            word = self.CHOP.sub('', word)
            noun.append(self.isNoun(word))
            isVerb, isPlural = self.isVerb(word)
            verb.append(isVerb)
            verbPlural.append(isPlural)
        return noun, verb, verbPlural

    def load(self, letter):
        """Read data/base/<letter>.oli into the noun and verb tables.

        Each line is expected to look like ``word (tag,tag,...) ...``.
        A missing or unreadable file leaves (possibly partial) tables for
        that letter rather than raising.
        """
        verbs = {}
        nouns = {}
        path = os.path.join(BASE, 'data/base/%s.oli' % letter)
        try:
            with open(path) as wordFile:
                for line in wordFile:
                    fields = line.rstrip('\r\n').split(' ', 2)
                    if len(fields) < 2:
                        continue
                    word = fields[0]
                    # Drop the surrounding brackets from the tag list.
                    parts = fields[1][1:-1].split(',')
                    # The exclusion list must apply to the whole test. The
                    # original "A or B and not C" only guarded the adverb
                    # branch, so "man" was still registered as a verb.
                    verbLike = ('v' in parts
                                or (len(word) > 3 and 'adv' in parts))
                    if verbLike and word not in self.NOT_A_HEADLINE_VERB:
                        verbs[word] = 1
                    if 'n' in parts:
                        nouns[word] = 1
        except IOError:
            # Best effort: an absent word file just means no known words.
            pass
        self.verbs[letter] = verbs
        self.nouns[letter] = nouns

    def getNouns(self, letter):
        """Return the noun table for a letter, loading it on first use."""
        # Test for presence, not truthiness, so an empty table (e.g. no
        # data file) is not re-read from disk on every lookup.
        if letter not in self.nouns:
            self.load(letter)
        return self.nouns[letter]

    def getVerbs(self, letter):
        """Return the verb table for a letter, loading it on first use."""
        if letter not in self.verbs:
            self.load(letter)
        return self.verbs[letter]

    def _pluralStems(self, word):
        """Yield the base forms obtained by removing each plural suffix."""
        for suffix in self.PLURAL_SUFFIXES:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if suffix == 'ies':
                    stem += 'y'
                yield stem

    def isNoun(self, origWord):
        "Returns true iff the word can be a noun."
        if not origWord:
            # Must be a plain 0: the old (0, 0) tuple was truthy and made
            # empty words count as nouns when ranking splits.
            return 0
        word = origWord.lower()
        nouns = self.getNouns(word[0])
        if nouns.get(word):
            return 1
        if word != origWord and origWord.upper() != origWord:
            #Proper noun. But words in all caps might be proper nouns or
            #they might just be in all caps.
            return 1
        for stem in self._pluralStems(word):
            if nouns.get(stem):
                return 1
        try:
            int(word)
        except ValueError:
            return 0
        # Numbers count as nouns.
        return 1

    def isVerb(self, word):
        """Returns a tuple x,y. x is true iff the word can be a verb. y is
        true iff x is true and the verb looks like it takes a plural
        subject."""
        if not word:
            return 0, 0
        word = word.lower()
        verbs = self.getVerbs(word[0])
        if verbs.get(word):
            # A bare verb form takes a plural subject ("they win"),
            # except for the listed singular forms.
            if word in self.SINGULAR_VERBS:
                return 1, 0
            return 1, 1
        for stem in self._pluralStems(word):
            if verbs.get(stem):
                return 1, 0
        for suffix in self.TENSE_SUFFIXES:
            if word.endswith(suffix) and verbs.get(word[:-len(suffix)]):
                return 1, 0
        return 0, 0


class Smusher:
    """Builds mixed-up headlines for one scraper and renders them."""

    # Matches any MORBID word standing alone (bounded by the string edge,
    # whitespace or punctuation), ignoring case.
    MORBID_RE = re.compile(r"((\A|[\s\W])(%s)(\Z|[\s\W]))" % "|".join(MORBID),
                           re.I)

    def __init__(self, key, wordList):
        self.key = key
        self.singular = None
        self.plural = None
        self.links = None
        self.totalLength = 0
        self.list = wordList

    def _anchor(self, link, text):
        """Render one [subject, object, url, rank] record as a hyperlink.

        The link shows `text` and carries the full original headline as
        its title.
        """
        return '<a href="%s" title="%s %s">%s</a>' % (
            _escapeHtml(link[2]), _escapeHtml(link[0]),
            _escapeHtml(link[1]), _escapeHtml(text))

    def mixup(self, number=5):
        """Return up to `number` mixed headlines.

        Each result is [first url, second url, html], where html links the
        subject of the first headline followed by the predicate of the
        second, plus a trailing HTML comment holding both split ranks.
        """
        # NOTE(review): the original format strings had lost their markup
        # (five arguments for '%s%s', two for ''), which raised TypeError.
        # The anchor and comment markup here is a reconstruction; confirm
        # against the intended output. Values are HTML-escaped, which
        # assumes the cached headlines are plain text.
        mixed = []
        for first, second in self.choose(number):
            linkText = '%s %s' % (self._anchor(first, first[0]),
                                  self._anchor(second, second[1]))
            story = linkText + '<!-- %s %s -->' % (first[-1], second[-1])
            mixed.append([first[2], second[2], story])
        return mixed

    def isMorbid(self, str):
        """Returns true iff the string is too morbid to be funny."""
        return self.MORBID_RE.search(str) is not None

    def _drawAcceptable(self, pool, field):
        """Remove random entries from pool until one is not morbid.

        `field` is the index of the text that will be displayed. Returns
        the accepted entry, or None once the pool is exhausted.
        """
        while pool:
            candidate = pool.pop(random.randrange(len(pool)))
            if not self.isMorbid(candidate[field]):
                return candidate
        return None

    def choose(self, number=5):
        """Pick up to `number` [first, second] pairs of parsed headlines.

        Both members of a pair come from the same pool (singular or
        plural) and no headline is used twice. Fewer pairs are returned
        when the pools run out of acceptable headlines.
        """
        # Was fetchLinks(key), which only worked through the module-level
        # loop variable in the __main__ section.
        self.fetchLinks(self.key)
        singular = copy.copy(self.singular)
        plural = copy.copy(self.plural)
        mixed = []
        for _ in range(number):
            pluralUsable = len(plural) > 2
            if len(singular) > 2 and (
                    not pluralUsable
                    or random.randrange(len(singular) + len(plural))
                    < len(singular)):
                # Chosen with probability proportional to the pool size
                # (the original compared against len(s[0]), the length of
                # a single record).
                pool = singular
            elif pluralUsable:
                pool = plural
            else:
                continue
            first = self._drawAcceptable(pool, 0)
            if first is None:
                continue
            second = self._drawAcceptable(pool, 1)
            if second is None:
                continue
            mixed.append([first, second])
        return mixed

    def fetchLinks(self, key):
        """Load the cached headlines for `key` and parse each one, once.

        Fills self.links (headline -> url as unpickled), self.singular
        (parsed [subject, object, url, rank] records), self.plural and
        self.totalLength. Later calls are no-ops.
        """
        if self.plural is not None:
            return
        filename = os.path.join(CACHE, key)
        if not os.path.exists(filename):
            scraperClasses[key]().pickleMe()
        # NOTE(review): pickle.load runs arbitrary code from the file; the
        # cache directory must only ever be written by trusted code.
        with open(filename, 'rb') as cacheFile:
            links = pickle.load(cacheFile)
        singular = []
        for headline, url in links.items():
            subject, predicate, plural, rank = self.parseHeadline(headline)
            #Plural headlines are not separated out yet; everything goes
            #into the singular pool.
            if subject and predicate:
                singular.append([subject, predicate, url, rank])
        # Publish only after a successful load, so a failure can be
        # retried instead of leaving empty pools behind.
        self.links = links
        self.singular = singular
        self.plural = []
        self.totalLength = len(self.singular) + len(self.plural)

    def findBestSplit(self, noun, verb):
        """Returns the best-ranking split location. A split location is a
        noun immediately followed by a verb. A noun that cannot be a verb
        followed by a verb that cannot be a noun is the best kind of
        split. If one of the words is ambiguous that's not as good, but
        it's better than both the words being ambiguous. An earlier split
        location is better than a later one of the same rank.

        Returns a list with the index and the rank given the index, or
        None when there is no split location.

        This simple mechanism (look for a noun followed by a verb) works
        in about 75% of cases, so I'm happy."""
        bestSoFar = None
        for i in range(len(noun) - 1):
            if noun[i] == 1 and verb[i + 1] == 1:
                #Let's figure out the rank.
                if not verb[i] and not noun[i + 1]:
                    rank = 0
                elif not verb[i] or not noun[i + 1]:
                    rank = 1
                else:
                    rank = 2
                if bestSoFar is None or rank < bestSoFar[1]:
                    bestSoFar = [i, rank]
                if rank == 0:
                    #No point in looking any further
                    break
        return bestSoFar

    def parseHeadline(self, headline):
        """Split a headline into subject and predicate.

        Returns (subject, object, plural, rank). When no split is found
        (including for an empty headline) subject and object are None,
        plural is 0 and rank is -1.
        """
        origWords = headline.split()
        # A hyphenated token is tagged by the part after its first hyphen.
        words = [w.split('-')[1] if '-' in w else w for w in origWords]
        noun, verb, plural = self.list.tag(words)
        nLocAndRank = self.findBestSplit(noun, verb)
        if not nLocAndRank:
            # plural[-1] used to be read here, which raised IndexError on
            # an empty headline and was otherwise an unrelated flag.
            return None, None, 0, -1
        nLoc, rank = nLocAndRank
        vLoc = nLoc + 1
        subject = " ".join(origWords[:vLoc])
        predicate = " ".join(origWords[vLoc:])
        return subject, predicate, plural[vLoc], rank

    def asHTML(self, links):
        """Render the mixup() results as an HTML fragment."""
        # NOTE(review): the original body returned only '\n' and ignored
        # `links`; its markup appears to have been lost. This emits a
        # minimal unordered list - confirm the intended layout.
        items = ['<li>%s</li>\n' % story for _, _, story in links]
        return '<ul>\n' + ''.join(items) + '</ul>\n'

    def asRSS(self, links, filename):
        """Write the mixup() results to `filename` as an RSS 2.0 feed."""
        import PyRSS2Gen
        import datetime
        scraper = scraperClasses[self.key]
        now = datetime.datetime.now()
        rss = PyRSS2Gen.RSS2(title = 'Dog Bites Dog: %s' % scraper.NAME,
                             link = scraper.HOMEPAGE,
                             description = scraper.DESCRIPTION,
                             lastBuildDate = now,
                             items = [])
        for first, second, story in links:
            # Both source urls are combined into one link/guid per item.
            link = first + '#' + second
            rss.items.append(PyRSS2Gen.RSSItem(
                description = story,
                link = link,
                guid = link,
                pubDate = now))
        with open(filename, 'w') as out:
            rss.write_xml(out)


def main():
    """Regenerate the HTML and RSS output for every known scraper."""
    # One WordList is shared so each letter's data file is read only once.
    wordList = WordList()
    for key in scraperClasses.keys():
        smusher = Smusher(key, wordList)
        links = smusher.mixup(20)
        html = smusher.asHTML(links)
        with open(os.path.join(BASE, "headlines-%s.html" % key), "w") as f:
            f.write(html)
        smusher.asRSS(links, os.path.join(BASE, "rss-%s.xml" % key))


if __name__ == '__main__':
    main()