# find all rss feeds within one click of a URL

__VERSION__ = "0.02"
__DATE__ = "2007-7-31"
__AUTHOR__ = "Phillip Pearson, Thad Kerosky"
__COPYRIGHT__ = "Copyright (C) 2003 Phillip Pearson, Thad Kerosky"
__LICENSE__ = "Python"
__HISTORY__ = """ 0.01 - PP - initial release 0.02 - TK - Extended to take explicit RSS links and output opml """

import string,sgmllib, urllib, rssfinder, sys
from cgi import escape


class parser(sgmllib.SGMLParser):
    """Collect the off-site links named by <item> elements of a document.

    After feeding a document, ``found_urls`` maps every item URL that does
    not start with ``root_url`` to the text of that item's <title>, and
    ``mainTitle`` holds the text chunks of the first non-empty <title>
    seen in the document.
    """

    def __init__(self, root_url, verbose=0):
        """Create a parser resolving relative links against *root_url*.

        *verbose* is passed straight through to ``sgmllib.SGMLParser``.
        """
        sgmllib.SGMLParser.__init__(self, verbose)
        self.root_url = root_url
        self.found_urls = {}   # absolute url -> item title
        self.url = []          # text chunks of the most recent <link>
        self.title = []        # text chunks of the most recent <title>
        self.data = []         # text collected since the last reset
        self.mainTitle = []    # text chunks of the first non-empty <title>

    def handle_data(self, data):
        """Append a chunk of character data to the current text buffer."""
        if self.data is not None:
            self.data.append(data)

    def start_link(self, attrs):
        """Begin collecting the text of a <link> element."""
        self.url = []
        # Start from an empty buffer so whitespace or other text that
        # preceded the tag does not end up inside the URL.  The pending
        # title is deliberately left alone: a <title> that comes before
        # the <link> in the same item must survive until end_item().
        self.data = []

    def end_link(self):
        """Store the collected text as the current URL."""
        self.url = self.data
        self.data = []

    def start_title(self,attrs):
        """Begin collecting the text of a <title> element."""
        self.title = []
        self.data = []

    def end_title(self):
        """Store the collected text as the current title.

        The first non-empty title of the document is also remembered as
        ``mainTitle``.
        """
        if not self.mainTitle:
            self.mainTitle = self.data
        self.title = self.data
        self.data = []

    def start_item(self,attrs):
        """Forget any title/link/text seen before this <item>."""
        self.title = []
        self.url = []
        self.data = []

    def end_item(self):
        """Record the item's link unless it points back under root_url."""
        # Strip surrounding whitespace so pretty-printed documents still
        # yield a URL that basejoin() recognises as absolute.
        target = "".join(self.url).strip()
        url = urllib.basejoin(self.root_url, target)
        if not url.startswith(self.root_url):
            self.found_urls[url] = "".join(self.title)

#    def start_rss(self, attrs):
#        print "begin"
#    def end_rss(self):
#        raise FoundLink # abort parsing!
def findall(url, excl): txt = urllib.urlopen(url).read() p = parser(url) try: p.feed(txt) p.close() except: htmlfn = 'bad.html' open(htmlfn, 'wt').write(txt) print "exception parsing html; saved as %s" % htmlfn raise def link_ok(link): if excl and link.find(excl) != -1: return 0 if not link.startswith('http://'): return 0 for bad_start in ('http://locahost', 'http://127.0.0.1'): if link.startswith(bad_start): return 0 return 1 links = [x for x in p.found_urls.keys() if link_ok(x)] links.sort() print >> sys.stderr, \ "\nFound the following",len(links), "links: \n", for link in links: print >> sys.stderr, \ "\t"+link , p.found_urls[link] print >> sys.stderr, \ "\nFinding RSS feeds ..." print "\n\n\t\n\t\tFeeds in URL %s\n\t\n\t" % escape(string.join(p.mainTitle,"")) feeds = 0 for k in links: print >> sys.stderr, \ "Feeds for \"%s\" at %s" % (p.found_urls[k],k) try: feedage = rssfinder.getFeeds(k) for f in feedage: print "\n\t" % (escape(p.found_urls[k]),escape(p.found_urls[k]),"rss",k,f) feeds = feeds+1 if (len(feedage) < 1): print "\n\t" % (p.found_urls[k],k) if (len(feedage) > 1): print >> sys.stderr, \ feedage except sgmllib.SGMLParseError: print >> sys.stderr, \ "\t(parse error)" except IOError: print >> sys.stderr, \ "\t(failed to fetch)" print "\n\t\n" print >> sys.stderr, \ "%s Feeds noticed" % feeds if __name__=='__main__': # Syntax: python oneclick.py [url] [exclusion-pattern] # # e.g. python oneclick.py http://scripting.com/ archive.scripting.com # analyses scripting.com, but avoids scripting.com archive pages # # e.g. python oneclick.py http://pyds.muensterland.org/ # analyses georg's blog # # e.g. python oneclick.py # analyses my blog excl = None if len(sys.argv) > 1: url = sys.argv[1] # url to check if len(sys.argv) > 2: excl = sys.argv[2] # exclusion pattern else: url = "http://www.myelin.co.nz/post/" findall(url, excl)