#!/usr/bin/python # -*- coding: utf-8 -*- """ Nifty script to convert HTML-tables to MediaWiki's own syntax. -xml - Retrieve information from a local XML dump (pages_current, see http://download.wikimedia.org). Argument can also be given as "-xml:filename". Searches for pages with HTML tables, and tries to convert them on the live wiki. -sql - Retrieve information from a local mirror. Searches for pages with HTML tables, and tries to convert them on the live wiki. -file:filename - Will read any [[wikipedia link]] and use these articles This SQL query might be helpful to generate this file, but it doesn't work for MediaWiki version 1.5 and above. SELECT CONCAT('[[', cur_title, ']]') FROM cur WHERE (cur_text LIKE '% Corrects attributes of tags KNOWN BUGS Broken HTML tables will most likely result in broken wiki tables! Please check every article you change. """ # (C) 2003 Thomas R. Koll, # # Distributed under the terms of the MIT license. __version__='$Id: table2wiki.py,v 1.82 2006/03/11 23:45:29 wikipedian Exp $' import re, sys, time import wikipedia, config, pagegenerators msg_no_warnings = {'de':'Bot: Tabellensyntax konvertiert', 'en':'User-controlled Bot: table syntax updated', 'es':'Bot controlado: actualizada sintaxis de tabla', 'ia':'Robot controlate: Syntaxe del tabella cambiate de HTML a Wiki', 'nl':'Tabel gewijzigd van HTML- naar Wikisyntax', 'pt':'Bot: Sintaxe da tabela HTML para Wiki atualizada', } msg_one_warning = {'de':'Bot: Tabellensyntax konvertiert - %d Warnung!', 'en':'User-controlled Bot: table syntax updated - %d warning!', 'es':'Bot controlado: actualizada sintaxis de tabla - %d aviso!', 'ia':'Robot controlate: Syntaxe del tabella cambiate - %d advertimento!', 'nl':'Tabel gewijzigd van HTML- naar Wikisyntax - %d waarschuwing!', 'pt':'Bot: Sintaxe da tabela HTML para Wiki atualizada - %d aviso', } msg_multiple_warnings = {'de':'Bot: Tabellensyntax konvertiert - %d Warnungen!', 'en':'User-controlled Bot: table syntax updated - %d warnings!', 
'es':'Bot controlado: actualizada sintaxis de tabla - %d avisos!', 'ia':'Robot controlate: Syntaxe del tabella cambiate - %d advertimentos!', 'nl':'Tabel gewijzigd van HTML- naar Wikisyntax - %d waarschuwingen!', 'pt':'Bot: Sintaxe da tabela HTML para Wiki atualizada - %d avisos', } class TableXmlDumpPageGenerator: ''' A page generator that will yield all pages that seem to contain an HTML table. ''' def __init__(self, xmlfilename): import xmlreader self.xmldump = xmlreader.XmlDump(xmlfilename) def __iter__(self): tableTagR = re.compile(' into one single line. num = 1 while num != 0: newTable, num = re.subn("([^\r\n]{1})(<[tT]{1}[dDhHrR]{1})", r"\1\r\n\2", newTable) ################## # every open-tag gets a new line. ################## # tag with attributes, with more text on the same line newTable = re.sub("(?i)[\r\n]*?
[\w\W]*?)>(?P[\w\W]*?)[\r\n ]*", r"\r\n{| \g\r\n\g", newTable) #
tag without attributes, with more text on the same line newTable = re.sub("(?i)[\r\n]*?
(?P[\w\W]*?)[\r\n ]*", r"\r\n{|\n\g\r\n", newTable) #
tag with attributes, without more text on the same line newTable = re.sub("(?i)[\r\n]*?
[\w\W]*?)>[\r\n ]*", r"\r\n{| \g\r\n", newTable) #
tag without attributes, without more text on the same line newTable = re.sub("(?i)[\r\n]*?
[\r\n ]*", "\r\n{|\r\n", newTable) # end
newTable = re.sub("(?i)[\s]*<\/table>", "\r\n|}", newTable) ################## # caption with attributes newTable = re.sub("(?i)[\w\W]*?)>(?P[\w\W]*?)<\/caption>", r"\r\n|+\g | \g", newTable) # caption without attributes newTable = re.sub("(?i)(?P[\w\W]*?)<\/caption>", r"\r\n|+ \g", newTable) ################## # often people don't write them within , be warned! # with attributes newTable = re.sub("(?i)[\r\n]+[^>]*?)>(?P
[\w\W]*?)<\/th>", r"\r\n!\g | \g
\r\n", newTable) # fail save. sometimes people forget # without attributes, without closing newTable, n = re.subn("(?i)[\r\n]+(?P
[\w\W]*?)[\r\n]+", r"\r\n! \g
\r\n", newTable) if n>0: warning_messages.append(u'WARNING: found without . (%d occurences)\n' % n) warnings += n # with attributes, without closing newTable, n = re.subn("(?i)[\r\n]+[^>]*?)>(?P
[\w\W]*?)[\r\n]+", r"\n!\g | \g
\r\n", newTable) if n>0: warning_messages.append(u'WARNING: found without . (%d occurences\n)' % n) warnings += n ################## # with attributes newTable = re.sub("(?i)[\r\n]*[^>]*?)>[\r\n]*", r"\r\n|-----\g\r\n", newTable) # without attributes newTable = re.sub("(?i)[\r\n]*[\r\n]*", r"\r\n|-----\r\n", newTable) ################## # normal without arguments newTable = re.sub("(?i)[\r\n]+(?P[\w\W]*?)<\/td>", r"\r\n| \g\r\n", newTable) ################## # normal with arguments newTable = re.sub("(?i)[\r\n]+[^>]*?)>(?P[\w\W]*?)<\/td>", r"\r\n|\g | \g", newTable) # WARNING: this sub might eat cells of bad HTML, but most likely it # will correct errors # TODO: some more docu please newTable, n = re.subn("(?i)[\r\n]+(?P[^\r\n]*?)", r"\r\n| \g\r\n", newTable) if n>0: warning_messages.append(u' used where was expected. (%d occurences)\n' % n) warnings += n # fail save, sometimes it's a # newTable, n = re.subn("[\r\n]+<(td|TD)>([^<]*?)<(td|TD)><\/(tr|TR)>", # "\r\n| \\2\r\n", newTable) # newTable, n = re.subn("[\r\n]+<(td|TD)([^>]*?)>([^<]*?)<(td|TD)><\/(tr|TR)>", # "\r\n|\\2| \\3\r\n", newTable) # if n>0: # warning_messages.append(u'WARNING: found , but no . (%d occurences)\n' % n) # warnings += n # what is this for? newTable, n = re.subn("[\r\n]+<(td|TD)([^>]+?)>([^\r\n]*?)<\/(td|TD)>", r"\r\n|\2 | \3\r\n", newTable) if n>0: warning_messages.append(u'WARNING: (sorry, bot code unreadable (1). I don\'t know why this warning is given.) (%d occurences)\n' % n) # fail save. sometimes people forget # without arguments, with missing newTable, n = re.subn("(?i)(?P[^<]*?)[\r\n]+", r"\r\n| \g\r\n", newTable) if n>0: warning_messages.append(u'NOTE: Found without . This shouldn\'t cause problems.\n') # with arguments, with missing newTable, n = re.subn("(?i)[\r\n]*[^>]*?)>(?P[\w\W]*?)[\r\n]+", r"\r\n|\g | \g\r\n", newTable) if n > 0: warning_messages.append(u'NOTE: Found without . 
This shouldn\'t cause problems.\n') ################## # Garbage collecting ;-) newTable = re.sub("(?i)[\r\n]*<\/tr>", "", newTable) # delete closing tags newTable = re.sub("(?i)[\r\n]*<\/t[rdh]>", "", newTable) ################## # OK, that's only theory but works most times. # Most browsers assume that gets a new row and we do the same # newTable, n = re.subn("([\r\n]+\|\ [^\r\n]*?)([\r\n]+\!)", # "\\1\r\n|-----\\2", newTable) # warnings = warnings + n # adds a |---- below for the case the new is missing # newTable, n = re.subn("([\r\n]+\!\ [^\r\n]*?[\r\n]+)(\|\ )", # "\\1|-----\r\n\\2", newTable) # warnings = warnings + n ################## # most come with '''title'''. Senseless in my eyes cuz # should be bold anyways. newTable = re.sub("[\r\n]+\!([^'\n\r]*)'''([^'\r\n]*)'''", r"\r\n!\1\2", newTable) ################## # kills indention within tables. Be warned, it might seldom bring # bad results. # True by default. Set 'deIndentTables = False' in user-config.py if config.deIndentTables: num = 1 while num != 0: newTable, num = re.subn("(\{\|[\w\W]*?)\n[ \t]+([\w\W]*?\|\})", r"\1\r\n\2", newTable) ################## # kills additional spaces after | or ! or {| # This line was creating problems, so I commented it out --Daniel # newTable = re.sub("[\r\n]+\|[\t ]+?[\r\n]+", "\r\n| ", newTable) # kills trailing spaces and tabs newTable = re.sub("\r\n(.*)[\t\ ]+[\r\n]+", r"\r\n\1\r\n", newTable) # kill extra new-lines newTable = re.sub("[\r\n]{4,}(\!|\|)", r"\r\n\1", newTable); ################## # shortening if had no arguments/parameters newTable = re.sub("[\r\n]+\{\|[\ ]+\| ", "\r\n\{| ", newTable) # shortening if
had no articles newTable = re.sub("[\r\n]+\|[\ ]+\| ", "\r\n| ", newTable) # shortening if had no articles newTable = re.sub("\n\|\+[\ ]+\|", "\n|+ ", newTable) # shortening of
had no articles newTable = re.sub("[\r\n]+\![\ ]+\| ", "\r\n! ", newTable) ################## # proper attributes. attribute values need to be in quotation marks. num = 1 while num != 0: # group 1 starts with newlines, followed by a table or row tag # ( {| or |--- ), then zero or more attribute key - value # pairs where the value already has correct quotation marks, and # finally the key of the attribute we want to fix here. # group 2 is the value of the attribute we want to fix here. # We recognize it by searching for a string of non-whitespace characters # - [^\s]+? - which is not embraced by quotation marks - [^"] newTable, num = re.subn(r'([\r\n]+(?:\|-|\{\|)[^\r\n\|]+) *= *([^"\s>]+)', r'\1="\2"', newTable, 1) num = 1 while num != 0: # The same for header and cell tags ( ! or | ), but for these tags the # attribute part is finished by a | character. We don't want to change # cell contents which accidentially contain an equal sign. # Group 1 and 2 are anologously to the previous regular expression, # group 3 are the remaining attribute key - value pairs. newTable, num = re.subn(r'([\r\n]+(?:!|\|)[^\r\n\|]+) *= *([^"\s>]+)([^\|\r\n]*)\|', r'\1="\2"\3|', newTable, 1) ################## # merge two short
s num = 1 while num != 0: newTable, num = re.subn("[\r\n]+(\|[^\|\-\}]{1}[^\n\r]{0,35})" + "[\r\n]+(\|[^\|\-\}]{1}[^\r\n]{0,35})[\r\n]+", r"\r\n\1 |\2\r\n", newTable) #### # add a new line if first is * or # newTable = re.sub("[\r\n]+\| ([*#]{1})", r"\r\n|\r\n\1", newTable) ################## # strip
from
newTable = re.sub("([\r\n]+\![^\r\n]+?)
([\w\W]+?)<\/center>", r"\1 \2", newTable) # strip align="center" from
because the .css does it # if there are no other attributes than align, we don't need that | either newTable = re.sub("([\r\n]+\! +)align\=\"center\" +\|", r"\1", newTable) # if there are other attributes, simply strip the align="center" newTable = re.sub("([\r\n]+\![^\r\n\|]+?)align\=\"center\"([^\n\r\|]+?\|)", r"\1 \2", newTable) ################## # kill additional spaces within arguments num = 1 while num != 0: newTable, num = re.subn("[\r\n]+(\||\!)([^|\r\n]*?)[ \t]{2,}([^\r\n]+?)", r"\r\n\1\2 \3", newTable) ################## # I hate those long lines because they make a wall of letters # Off by default, set 'splitLongParagraphs = True' in user-config.py if config.splitLongParagraphs: num = 1 while num != 0: # TODO: how does this work? docu please. # why are only äöüß used, but not other special characters? newTable, num = re.subn("(\r\n[A-Z]{1}[^\n\r]{200,}?[a-zäöüß]\.)\ ([A-ZÄÖÜ]{1}[^\n\r]{200,})", r"\1\r\n\2", newTable) # show the changes for this table if self.debug: print table print newTable elif not self.quietMode: wikipedia.showDiff(table, newTable) return newTable, warnings, warning_messages def findTable(self, text): """ Finds an HTML table (which can contain nested tables) inside a text. Returns the table and the start and end position inside the text. """ start = text.find(" 0: if text.find("
def convertAllHTMLTables(self, text):
    """
    Convert every HTML table found in text to wiki syntax.

    Repeatedly asks self.findTable() for the next table, converts it with
    self.convertTable() and splices the result back into the text, until
    findTable() reports no further table.

    Returns a tuple (converted text, number of converted tables,
    total number of warnings).
    """
    convertedTables = 0
    warningSum = 0
    collectedMessages = []
    while True:
        table, start, end = self.findTable(text)
        if not table:
            # no more HTML tables left
            break
        tableNumber = convertedTables + 1
        print(">> Table %i <<" % tableNumber)
        # convert the current table
        newTable, warningsThisTable, warnMsgsThisTable = self.convertTable(table)
        print("")
        warningSum += warningsThisTable
        for msg in warnMsgsThisTable:
            collectedMessages.append('In table %i: %s' % (tableNumber, msg))
        # replace the HTML table in place; the next findTable() call
        # searches the updated text again from the beginning
        text = text[:start] + newTable + text[end:]
        convertedTables = tableNumber
    # the messages are shown in one go after all tables were processed
    wikipedia.output(u''.join(collectedMessages))
    return text, convertedTables, warningSum


def treat(self, pl):
    """
    Load a page, convert all HTML tables in its text to wiki syntax and
    save the converted text.

    Returns True once pl.put() has returned for the converted text.
    Returns False in every other case: the page does not exist, the page
    is a redirect, no table was converted, or the user declined the upload.
    Exceptions raised by pl.put() are not caught here.
    """
    wikipedia.output(u'\n>>> %s <<<' % pl.title())
    site = pl.site()
    try:
        text = pl.get()
    except wikipedia.NoPage:
        wikipedia.output(u"ERROR: couldn't find %s" % pl.title())
        return False
    except wikipedia.IsRedirectPage:
        wikipedia.output(u'Skipping redirect %s' % pl.title())
        return False

    newText, convertedTables, warningSum = self.convertAllHTMLTables(text)
    if convertedTables == 0:
        wikipedia.output(u"No changes were necessary.")
        return False

    # Decide whether to upload without asking the user.
    if config.table2wikiAskOnlyWarnings and warningSum == 0:
        doUpload = True
    elif config.table2wikiSkipWarnings:
        doUpload = True
    else:
        print("There were %i replacement(s) that might lead to bad output." % warningSum)
        doUpload = (wikipedia.input(u'Do you want to change the page anyway? [y|N]') == "y")
    if not doUpload:
        return False

    # Pick the edit summary matching the number of warnings.
    if warningSum == 0:
        summary = wikipedia.translate(site.lang, msg_no_warnings)
    elif warningSum == 1:
        summary = wikipedia.translate(site.lang, msg_one_warning) % warningSum
    else:
        summary = wikipedia.translate(site.lang, msg_multiple_warnings) % warningSum
    wikipedia.setAction(summary)
    pl.put(newText)
    return True


def run(self):
    """Call self.treat() on every page yielded by self.generator."""
    for pl in self.generator:
        self.treat(pl)