# -*- coding: utf-8 -*-
"""
Script to resolve double redirects, and to delete broken redirects.
Requires access to MediaWiki's maintenance pages or to a XML dump file. Delete
function requires adminship.

Syntax:

    python redirect.py action [-argument]

where action can be one of these:

* double - fix redirects which point to other redirects
* broken - delete redirects where targets don\'t exist. Requires adminship.

and argument can be:

* xml         - retrieve information from a local XML dump
                (http://download.wikimedia.org). Argument can also be given as
                "-xml:filename.xml". If this argument isn't given, info will be
                loaded from a special page of the live wiki.
* namespace:n - Namespace to process. Works only with an XML dump.
                Currently not supported!
* restart:n   - Number of redirect to restart with (see progress). Works only
                with an XML dump. Currently not supported!

"""
# A future statement has to precede every other statement of the module
# (only the docstring, comments and blank lines may come first), so it is
# placed ahead of the __version__ assignment.
from __future__ import generators
#
# (C) Daniel Herding, 2004
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: redirect.py,v 1.42 2006/03/11 14:41:18 wikipedian Exp $'
#
import wikipedia, config
import xmlreader
import re, sys

# Summary message for fixing double redirects, keyed by language code
msg_double = {
    'en': u'Robot: Fixing double redirect',
    'de': u'Bot: Korrigiere doppelten Redirect',
    'fr': u'Robot : répare double redirection',
    'ia': u'Robot: reparation de duple redirection',
    'is': u'Vélmenni: Lagfæri tvöfalda tilvísun',
    'pt': u'Bot: Corrigido duplo redirecionamento',
    'sr': u'Бот: Поправка дуплих преусмерења',
}

# Reason for deleting broken redirects, keyed by language code
reason_broken = {
    'en': u'Robot: Redirect target doesn\'t exist',
    'de': u'Bot: Weiterleitungsziel existiert nicht',
    'fr': u'Robot : Cible du redirect inexistante',
    'ia': u'Robot: Scopo del redirection non existe',
    'pt': u'Bot: Redirecionamento não existe',
    'sr': u'Бот: Преусмерење не постоји',
}


class RedirectGenerator:
    """
    Collects redirect information either from a local XML dump or, when no
    dump file is given, from the live wiki.
    """

    def __init__(self, xmlFilename = None, namespace = -1, restart = -1):
        """
        Store the settings used by the retrieval methods.

        xmlFilename - path of a local XML dump; None means no dump is used.
        namespace   - namespace to process, -1 by default. The module help
                      marks this option as currently not supported.
        restart     - number of the redirect to restart with, -1 by default.
                      The module help marks this option as currently not
                      supported.
        """
        self.xmlFilename = xmlFilename
        self.namespace = namespace
        self.restart = restart
def get_redirects_from_dump(self): ''' Loads a local XML dump file, looks at all pages which have the redirect flag set, and finds out where they're pointing at. Returns a dictionary where the redirect names are the keys and the redirect targets are the values. ''' xmlFilename = self.xmlFilename dict = {} # open xml dump and read page titles out of it dump = xmlreader.XmlDump(xmlFilename) redirR = wikipedia.getSite().redirectRegex() readPagesCount = 0 for entry in dump.parse(): readPagesCount += 1 # always print status message after 10000 pages if readPagesCount % 10000 == 0: print '%i pages read...' % readPagesCount # if self.namespace != -1 and self.namespace != entry.namespace: # continue m = redirR.search(entry.text) if m: target = m.group(1) # There might be redirects to another wiki. Ignore these. for code in wikipedia.getSite().family.langs.keys(): if target.startswith('%s:' % code) or target.startswith(':%s:' % code): # TODO: doesn't seem to work wikipedia.output(u'NOTE: Ignoring %s which is a redirect to %s:' % (entry.title, code)) target = None break # if the redirect does not link to another wiki if target: target = target.replace(' ', '_') # remove leading and trailing whitespace target = target.strip() # capitalize the first letter if not wikipedia.getSite().nocapitalize: target = target[0].upper() + target[1:] if '#' in target: target = target[:target.index('#')] if '|' in target: wikipedia.output(u'HINT: %s is a redirect with a pipelink.' % entry.title) target = target[:target.index('|')] dict[entry.title] = target return dict def retrieve_broken_redirects(self): if self.xmlFilename == None: # retrieve information from the live wiki's maintenance page mysite = wikipedia.getSite() # broken redirect maintenance page's URL path = mysite.broken_redirects_address(default_limit = False) print 'Retrieving special page...' 
maintenance_txt = mysite.getUrl(path) # regular expression which finds redirects which point to a non-existing page inside the HTML Rredir = re.compile('\