# -*- coding: utf-8 -*-
"""
Readers for MediaWiki XML exports.

Each XmlEntry object represents one page revision, as read from an XML source.

The MediaWikiXmlHandler is a SAX content handler that can be used for the XML
given by Special:Export as well as for XML dumps.

The XmlDump class reads a pages_current XML dump (like the ones offered on
http://download.wikimedia.org/wikipedia/de/) and offers a generator over
XmlEntry objects which can be used by other bots.
"""
__version__='$Id: xmlreader.py,v 1.17 2006/03/10 11:54:45 a_engels Exp $'

import threading
import time
import xml.sax
import xml.sax.handler
import codecs
import re

import wikipedia


# Entities decoded by _unescape().  '&amp;' must stay last: decoding it first
# would turn e.g. '&amp;lt;' into '&lt;' and then, wrongly, into '<'.
_XML_ENTITIES = (
    (u'&gt;', u'>'),
    (u'&lt;', u'<'),
    (u'&quot;', u'"'),
    (u'&amp;', u'&'),
)


def _unescape(text):
    """Return text with the basic XML entities decoded (None/'' pass through)."""
    if not text:
        return text
    for entity, char in _XML_ENTITIES:
        text = text.replace(entity, char)
    return text


def parseRestrictions(restrictions):
    '''
    Parse the character data of a <restrictions> element.

    Returns a tuple (editRestriction, moveRestriction) of strings naming the
    user group allowed to edit, respectively move, the page.  None means
    there is no such restriction.

    Two formats are understood: 'edit=GROUP:move=GROUP' (either part may be
    missing) and the bare value 'sysop', which restricts both actions.
    '''
    if not restrictions:
        return None, None
    editRestriction = None
    moveRestriction = None
    editLockMatch = re.search(r'edit=([^:]*)', restrictions)
    if editLockMatch:
        editRestriction = editLockMatch.group(1)
    moveLockMatch = re.search(r'move=([^:]*)', restrictions)
    if moveLockMatch:
        moveRestriction = moveLockMatch.group(1)
    if restrictions == 'sysop':
        editRestriction = 'sysop'
        moveRestriction = 'sysop'
    return editRestriction, moveRestriction


class XmlEntry:
    """
    Represents a page (more precisely: one revision of a page).

    Attributes mirror the constructor arguments: title, id (page id), text,
    timestamp, editRestriction, moveRestriction and revisionid.
    """
    def __init__(self, title, id, text, timestamp, editRestriction,
                 moveRestriction, revisionid):
        # TODO: there are more tags we can read.
        self.title = title
        self.id = id
        self.text = text
        self.timestamp = timestamp
        self.editRestriction = editRestriction
        self.moveRestriction = moveRestriction
        self.revisionid = revisionid


class XmlHeaderEntry:
    """
    Represents a header entry (the contents of the <siteinfo> element).
    """
    def __init__(self):
        self.sitename = u''
        self.base = u''
        self.generator = u''
        self.case = u''
        # Maps the integer namespace key to the namespace name.
        self.namespaces = {}


class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
    """
    SAX handler which builds an XmlEntry for every <revision> element.

    Register a page callback with setCallback(); it is called with each
    XmlEntry as soon as its </revision> tag is read.  Optionally register a
    header callback with setHeaderCallback(); it is called once with an
    XmlHeaderEntry when </siteinfo> is read.  Header elements are only
    collected when a header callback is set.
    """

    # Element names whose character data is accumulated in the handler
    # attribute of the same name.
    _PAGE_FIELDS = ('text', 'id', 'revisionid', 'restrictions', 'title',
                    'timestamp')
    # Element names whose character data is accumulated in the
    # XmlHeaderEntry attribute of the same name.
    _HEADER_FIELDS = ('sitename', 'base', 'generator', 'case')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.inRevisionTag = False
        self.inContributorTag = False
        self.callback = None
        self.headercallback = None
        self.header = None
        # Name of the attribute currently receiving character data, or None.
        self.destination = None
        # Older Mediawiki version sometimes do not have these elements.
        # They are initialized here so they at least have some value when
        # asked for
        self.id = u''
        self.revisionid = u''
        # The remaining fields get defaults as well, so that a sparse
        # document yields empty values instead of an AttributeError.
        self.title = u''
        self.text = u''
        self.timestamp = u''
        self.restrictions = u''
        self.userid = u''
        self.namespace = u''
        self.namespaceid = None
        self.editRestriction = None
        self.moveRestriction = None

    def setCallback(self, callback):
        """Set the function called with each completed XmlEntry."""
        self.callback = callback

    def setHeaderCallback(self, headercallback):
        """Set the function called with the XmlHeaderEntry of the dump."""
        self.headercallback = headercallback

    def startElement(self, name, attrs):
        """Select where the character data of the opened element goes."""
        self.destination = None
        if name == 'page':
            # Start every page from a clean slate, so that a missing element
            # does not inherit the value of the previous page.
            self.editRestriction = None
            self.moveRestriction = None
            self.title = u''
            self.id = u''
        elif name == 'revision':
            self.inRevisionTag = True
            self.revisionid = u''
            self.timestamp = u''
            self.text = u''
        elif name == 'contributor':
            self.inContributorTag = True
        elif name == 'text':
            self.destination = 'text'
            self.text = u''
        elif name == 'id':
            # <id> occurs on three levels; the enclosing element decides
            # which id this is.
            if self.inContributorTag:
                self.destination = 'userid'
                self.userid = u''
            elif self.inRevisionTag:
                self.destination = 'revisionid'
                self.revisionid = u''
            else:
                self.destination = 'id'
                self.id = u''
        elif name == 'restrictions':
            self.destination = 'restrictions'
            self.restrictions = u''
        elif name == 'title':
            self.destination = 'title'
            self.title = u''
        elif name == 'timestamp':
            self.destination = 'timestamp'
            self.timestamp = u''
        elif self.headercallback:
            if name == 'siteinfo':
                self.header = XmlHeaderEntry()
            elif name in self._HEADER_FIELDS:
                self.destination = name
            elif name == 'namespace':
                self.destination = 'namespace'
                self.namespace = u''
                self.namespaceid = int(attrs['key'])

    def endElement(self, name):
        """Finish the closed element; report a page when </revision> is hit."""
        # Stop collecting: whitespace between this end tag and the next start
        # tag must not be appended to the field that was just closed.
        self.destination = None
        if name == 'contributor':
            self.inContributorTag = False
        elif name == 'restrictions':
            self.editRestriction, self.moveRestriction \
                = parseRestrictions(self.restrictions)
            if self.editRestriction:
                wikipedia.output(u'DBG: Edit restriction: %s'
                                 % self.editRestriction)
            if self.moveRestriction:
                wikipedia.output(u'DBG: Move restriction: %s'
                                 % self.moveRestriction)
        elif name == 'revision':
            # All done for this.
            # Remove trailing newlines and spaces
            text = self.text.rstrip('\n ')
            # Replace newline by cr/nl
            text = text.replace('\n', u'\r\n')
            # Decode the timestamp: keep only the digit groups of
            # 'YYYY-MM-DDTHH:MM:SSZ', giving 'YYYYMMDDHHMMSS'.
            stamp = self.timestamp
            timestamp = (stamp[0:4] + stamp[5:7] + stamp[8:10] +
                         stamp[11:13] + stamp[14:16] + stamp[17:19])
            self.title = self.title.strip()
            # Report back to the caller
            entry = XmlEntry(self.title, self.id, text, timestamp,
                             self.editRestriction, self.moveRestriction,
                             self.revisionid)
            self.inRevisionTag = False
            if self.callback is not None:
                self.callback(entry)
        elif self.headercallback:
            if name == 'namespace':
                if self.header is not None:
                    self.header.namespaces[self.namespaceid] = self.namespace
            elif name == 'siteinfo':
                self.headercallback(self.header)
                self.header = None

    def characters(self, data):
        """Append character data to the field chosen by startElement()."""
        destination = self.destination
        if destination in self._PAGE_FIELDS:
            setattr(self, destination, getattr(self, destination) + data)
        elif self.headercallback:
            if destination in self._HEADER_FIELDS:
                if self.header is not None:
                    setattr(self.header, destination,
                            getattr(self.header, destination) + data)
            elif destination == 'namespace':
                self.namespace += data


class XmlParserThread(threading.Thread):
    """
    This XML parser will run as a single thread. This allows the XmlDump
    generator to yield pages before the parser has finished reading the
    entire dump.

    There surely are more elegant ways to do this.
    """
    def __init__(self, filename, handler):
        threading.Thread.__init__(self)
        self.filename = filename
        self.handler = handler

    def run(self):
        """Parse self.filename with SAX, feeding events to self.handler."""
        xml.sax.parse(self.filename, self.handler)


class XmlDump(object):
    """
    Represents an XML dump file.

    The file is not read at initialization; parse() reads it lazily and
    offers access to the resulting XmlEntries via a generator.

    NOTE: This used to be done by a SAX parser, but this solution with
    regular expressions is about 10 to 20 times faster.

    NOTE: unlike MediaWikiXmlHandler, parse() passes on the timestamp and
    the text exactly as they appear in the dump (ISO timestamp, '\\n' line
    ends).
    """

    # Matches one complete <page> element holding a single <revision>.
    _PAGE_RE = re.compile(
        r'<page>\s*'
        r'<title>(?P<title>.+?)</title>\s*'
        r'<id>(?P<pageid>\d+?)</id>\s*'
        r'(<restrictions>(?P<restrictions>.+?)</restrictions>)?\s*'
        r'<revision>\s*'
        r'<id>(?P<revisionid>\d+?)</id>\s*'
        r'<timestamp>(?P<timestamp>.+?)</timestamp>\s*'
        r'<contributor>\s*'
        r'(<username>(?P<username>.+?)</username>\s*'
        r'<id>(?P<userid>\d+?)</id>|<ip>(?P<ip>.+?)</ip>)\s*'
        r'</contributor>\s*'
        r'(?P<minor>(<minor />))?\s*'
        r'(?:<comment>(?P<comment>.+?)</comment>\s*)?'
        r'(<text xml:space="preserve">(?P<text>.*?)</text>'
        r'|<text xml:space="preserve" />)\s*'
        r'</revision>\s*'
        r'</page>',
        re.DOTALL)

    def __init__(self, filename):
        self.filename = filename

    def parse(self):
        '''
        Generator which reads some lines from the XML dump file, and
        parses them to create XmlEntry objects. Stops when the end of file
        is reached.

        Lines are collected until one ends with </page>; the collected chunk
        is then matched against the page pattern.  Chunks which do not match
        are reported through wikipedia.output() and skipped.
        '''
        f = codecs.open(self.filename, 'r',
                        encoding=wikipedia.myencoding(), errors='replace')
        try:
            wikipedia.output(u'Reading XML dump...')
            pending = []
            # readline() returns an empty string at end of file.
            for line in iter(f.readline, u''):
                pending.append(line)
                # rstrip() makes the test work for '\r\n' line ends and for
                # a final line without newline as well.
                if not line.rstrip().endswith(u'</page>'):
                    continue
                chunk = u''.join(pending)
                pending = []
                m = self._PAGE_RE.search(chunk)
                if not m:
                    wikipedia.output(u'ERROR: could not parse these lines:')
                    wikipedia.output(chunk)
                    continue
                # Unescape characters only after matching, so that escaped
                # markup inside the page text (e.g. '&lt;/text&gt;') cannot
                # be mistaken for the structure of the dump.
                text = _unescape(m.group('text')) or u''
                restrictions = _unescape(m.group('restrictions'))
                editRestriction, moveRestriction \
                    = parseRestrictions(restrictions)
                yield XmlEntry(title=_unescape(m.group('title')),
                               id=m.group('pageid'),
                               text=text,
                               timestamp=m.group('timestamp'),
                               editRestriction=editRestriction,
                               moveRestriction=moveRestriction,
                               revisionid=m.group('revisionid'))
        finally:
            # Also runs when the consumer abandons the generator early.
            f.close()