# -*- coding: utf-8 -*- """ This module can do slight modifications to a wiki page source code such that the code looks cleaner. The changes are not supposed to change the look of the rendered wiki page. WARNING: This module needs more testing! """ import wikipedia, pagegenerators import sys import re # Summary message when using this module as a stand-alone script msg_standalone = { 'de': u'Bot: Kosmetische Änderungen', 'en': u'Robot: Cosmetic changes', 'pt': u'Bot: Mudanças triviais', } # Summary message that will be appended to the normal message when # cosmetic changes are made on the fly msg_append = { 'de': u'; kosmetische Änderungen', 'en': u'; cosmetic changes', 'pt': u'; mudanças triviais', } deprecatedTemplates = { 'wikipedia': { 'de': [ u'Stub', ] } } class CosmeticChangesToolkit: def __init__(self, site, debug = False): self.site = site self.debug = debug def change(self, text): """ Given a wiki source code text, returns the cleaned up version. """ oldText = text text = self.standardizeInterwiki(text) text = self.standardizeCategories(text) text = self.cleanUpLinks(text) text = self.cleanUpSectionHeaders(text) text = self.translateAndCapitalizeNamespaces(text) text = self.removeDeprecatedTemplates(text) text = self.resolveHtmlEntities(text) text = self.validXhtml(text) text = self.removeUselessSpaces(text) if self.debug: wikipedia.showDiff(oldText, text) return text def standardizeInterwiki(self, text): """ Makes sure that interwiki links are put to the correct position and into the right order. """ interwikiLinks = wikipedia.getLanguageLinks(text, insite = self.site) text = wikipedia.replaceLanguageLinks(text, interwikiLinks, site = self.site) return text def standardizeCategories(self, text): """ Makes sure that interwiki links are put to the correct position, but does not sort them. """ categories = wikipedia.getCategoryLinks(text, site = self.site) text = wikipedia.replaceCategoryLinks(text, categories, site = self.site) return text def translateAndCapitalizeNamespaces(self, text): """ Makes sure that localized namespace names are used. """ family = self.site.family for nsNumber in family.namespaces: thisNs = family.namespace(self.site.lang, nsNumber) defaultNs = family.namespace('_default', nsNumber) if thisNs != defaultNs: text = wikipedia.replaceExceptNowikiAndComments(text, r'\[\[\s*' + defaultNs + '\s*:(?P.*?)\]\]', r'[[' + thisNs + ':\g]]') if self.site.nocapitalize: for nsNumber in family.namespaces: thisNs = family.namespace(self.site.lang, nsNumber) lowerNs = thisNs[0].lower() + thisNs[1:] # this assumes that all NS names have length at least 2 text = wikipedia.replaceExceptNowikiAndComments(text, r'\[\[\s*' + lowerNs + '\s*:(?P.*?)\]\]', r'[[' + thisNs + ':\g]]') return text def cleanUpLinks(self, text): trailR = re.compile(self.site.linktrail()) # The regular expression which finds links. Results consist of four groups: # group title is the target page title, that is, everything before | or ]. # group section is the page section. It'll include the # to make life easier for us. # group label is the alternative link title, that's everything between | and ]. # group linktrail is the link trail, that's letters after ]] which are part of the word. # note that the definition of 'letter' varies from language to language. self.linkR = re.compile(r'\[\[(?P[^\]\|]+)(\|(?P