# -*- coding: iso-8859-1 -*-
"""
(C) 2003 Thomas R. Koll,
Distributed under the terms of the MIT license.
"""

__version__='$Id: WdTXMLParser.py,v 1.1 2006/02/02 16:58:03 wikipedian Exp $'

DEBUG = 0

import re
from xml.sax.handler import ContentHandler


class WdTXMLParser(ContentHandler):
    """SAX content handler that collects the <item> elements of a document.

    For every <item> the handler extracts three values from the text of
    its child elements:

      * title -- from <title>: the text following the last ': ' separator
      * link  -- from <link>: the 'http://...' part of the text
      * count -- from <description>: the digits following the last ': '
                 (kept as a string, possibly empty)

    Completed items are stored in ``self.result`` as::

        {title: {'link': link, 'count': count}}

    Text chunks that do not match the expected pattern are ignored, and
    an item for which any of the three values is missing is skipped.

    NOTE: each call to characters() is matched on its own.  A SAX parser
    is allowed to deliver one text node in several chunks; in that case
    the last chunk that matches the pattern wins.
    """

    def __init__(self):
        """Compile the extraction patterns and reset all parser state."""
        ContentHandler.__init__(self)
        self.rTitle = re.compile(r'(.*): (.*)')
        self.rLink = re.compile(r'.*[\r\n]*(http://.*)')
        self.rCount = re.compile(r'.*: (\d*)')
        # State flags.  These names must be the ones used by
        # startElement(), characters() and endElement(); initialising them
        # here guarantees characters() can always read them.
        self.inItem = 0
        self.inTitle = 0
        self.inLink = 0
        self.inDescription = 0
        self.tmp = {}      # values collected for the item being parsed
        self.result = {}   # title -> {'link': ..., 'count': ...}

    def startDocument(self):
        """Discard results and partial state from any previous parse."""
        self.result = {}
        self.tmp = {}
        self.inItem = 0
        self.inTitle = 0
        self.inLink = 0
        self.inDescription = 0

    def endDocument(self):
        """Return the collected results (also available as self.result)."""
        return self.result

    def startElement(self, name, attrs):
        """Track entry into <item> and into its title/link/description."""
        if name == 'item':
            self.inItem = 1
        # Child elements are only of interest inside an <item>.
        if self.inItem == 1:
            if name == 'title':
                self.inTitle = 1
            if name == 'link':
                self.inLink = 1
            if name == 'description':
                self.inDescription = 1

    def characters(self, characters):
        """Extract title, link or count from a chunk of element text.

        Chunks that do not match the relevant pattern (for example the
        whitespace between elements) are ignored.
        """
        if not self.inItem:
            return
        if self.inTitle:
            self._capture('title', self.rTitle, 2, characters)
        if self.inLink:
            self._capture('link', self.rLink, 1, characters)
        if self.inDescription:
            self._capture('count', self.rCount, 1, characters)

    def endElement(self, name):
        """Leave the current element; store the item when </item> is seen."""
        if name == 'item':
            self.inItem = 0
            entry = self.tmp
            self.tmp = {}
            # Only store complete items; a missing field would otherwise
            # raise KeyError and abort the whole parse.
            if 'title' in entry and 'link' in entry and 'count' in entry:
                self.result[entry['title']] = {
                    'link': entry['link'],
                    'count': entry['count'],
                }
        elif name == 'title':
            self.inTitle = 0
        elif name == 'link':
            self.inLink = 0
        elif name == 'description':
            self.inDescription = 0

    def _capture(self, key, pattern, group, text):
        """Store group `group` of `pattern` matched against `text` in tmp[key].

        Does nothing when the text does not match.
        """
        found = pattern.match(text)
        if found is not None:
            self.tmp[key] = found.group(group)