\r\n", newTable)
if n>0:
warning_messages.append(u'NOTE: Found | without | . This shouldn\'t cause problems.\n')
# with arguments, with missing |
newTable, n = re.subn("(?i)[\r\n]*[^>]*?)>(?P[\w\W]*?)[\r\n]+",
r"\r\n|\g | \g\r\n", newTable)
if n > 0:
warning_messages.append(u'NOTE: Found | without | . This shouldn\'t cause problems.\n')
##################
# Garbage collecting ;-)
newTable = re.sub("(?i)[\r\n]*<\/tr>", "", newTable)
# delete closing tags
newTable = re.sub("(?i)[\r\n]*<\/t[rdh]>", "", newTable)
##################
# OK, that's only theory but works most times.
# Most browsers assume that | gets a new row and we do the same
# newTable, n = re.subn("([\r\n]+\|\ [^\r\n]*?)([\r\n]+\!)",
# "\\1\r\n|-----\\2", newTable)
# warnings = warnings + n
# adds a |---- below for the case the new | is missing
# newTable, n = re.subn("([\r\n]+\!\ [^\r\n]*?[\r\n]+)(\|\ )",
# "\\1|-----\r\n\\2", newTable)
# warnings = warnings + n
##################
# most <th> come with '''title'''. Senseless in my eyes because
# <th> should be bold anyway.
newTable = re.sub("[\r\n]+\!([^'\n\r]*)'''([^'\r\n]*)'''",
r"\r\n!\1\2", newTable)
##################
# kills indention within tables. Be warned, it might seldom bring
# bad results.
# True by default. Set 'deIndentTables = False' in user-config.py
if config.deIndentTables:
num = 1
while num != 0:
newTable, num = re.subn("(\{\|[\w\W]*?)\n[ \t]+([\w\W]*?\|\})",
r"\1\r\n\2", newTable)
##################
# kills additional spaces after | or ! or {|
# This line was creating problems, so I commented it out --Daniel
# newTable = re.sub("[\r\n]+\|[\t ]+?[\r\n]+", "\r\n| ", newTable)
# kills trailing spaces and tabs
newTable = re.sub("\r\n(.*)[\t\ ]+[\r\n]+",
r"\r\n\1\r\n", newTable)
# kill extra new-lines
newTable = re.sub("[\r\n]{4,}(\!|\|)",
r"\r\n\1", newTable);
##################
# shortening if had no arguments/parameters
newTable = re.sub("[\r\n]+\{\|[\ ]+\| ", "\r\n\{| ", newTable)
# shortening if | had no articles
newTable = re.sub("[\r\n]+\|[\ ]+\| ", "\r\n| ", newTable)
# shortening if | had no articles
newTable = re.sub("\n\|\+[\ ]+\|", "\n|+ ", newTable)
# shortening of had no articles
newTable = re.sub("[\r\n]+\![\ ]+\| ", "\r\n! ", newTable)
##################
# proper attributes. attribute values need to be in quotation marks.
num = 1
while num != 0:
# group 1 starts with newlines, followed by a table or row tag
# ( {| or |--- ), then zero or more attribute key - value
# pairs where the value already has correct quotation marks, and
# finally the key of the attribute we want to fix here.
# group 2 is the value of the attribute we want to fix here.
# We recognize it by searching for a string of non-whitespace characters
# - [^\s]+? - which is not embraced by quotation marks - [^"]
newTable, num = re.subn(r'([\r\n]+(?:\|-|\{\|)[^\r\n\|]+) *= *([^"\s>]+)',
r'\1="\2"', newTable, 1)
num = 1
while num != 0:
# The same for header and cell tags ( ! or | ), but for these tags the
# attribute part is finished by a | character. We don't want to change
# cell contents which accidentally contain an equal sign.
# Groups 1 and 2 are analogous to the previous regular expression,
# group 3 holds the remaining attribute key - value pairs.
newTable, num = re.subn(r'([\r\n]+(?:!|\|)[^\r\n\|]+) *= *([^"\s>]+)([^\|\r\n]*)\|',
r'\1="\2"\3|', newTable, 1)
##################
# merge two short <td>s (consecutive cell lines) onto one line
num = 1
while num != 0:
newTable, num = re.subn("[\r\n]+(\|[^\|\-\}]{1}[^\n\r]{0,35})" +
"[\r\n]+(\|[^\|\-\}]{1}[^\r\n]{0,35})[\r\n]+",
r"\r\n\1 |\2\r\n", newTable)
####
# add a new line if first is * or #
newTable = re.sub("[\r\n]+\| ([*#]{1})",
r"\r\n|\r\n\1", newTable)
##################
# strip <center> from <th>
newTable = re.sub("([\r\n]+\![^\r\n]+?)([\w\W]+?)<\/center>",
r"\1 \2", newTable)
# strip align="center" from <th> because the .css does it
# if there are no other attributes than align, we don't need that | either
newTable = re.sub("([\r\n]+\! +)align\=\"center\" +\|",
r"\1", newTable)
# if there are other attributes, simply strip the align="center"
newTable = re.sub("([\r\n]+\![^\r\n\|]+?)align\=\"center\"([^\n\r\|]+?\|)",
r"\1 \2", newTable)
##################
# kill additional spaces within arguments
num = 1
while num != 0:
newTable, num = re.subn("[\r\n]+(\||\!)([^|\r\n]*?)[ \t]{2,}([^\r\n]+?)",
r"\r\n\1\2 \3", newTable)
##################
# I hate those long lines because they make a wall of letters
# Off by default, set 'splitLongParagraphs = True' in user-config.py
if config.splitLongParagraphs:
num = 1
while num != 0:
# TODO: how does this work? docu please.
# why are only äöüß used, but not other special characters?
newTable, num = re.subn("(\r\n[A-Z]{1}[^\n\r]{200,}?[a-zäöüß]\.)\ ([A-ZÄÖÜ]{1}[^\n\r]{200,})",
r"\1\r\n\2", newTable)
# show the changes for this table
if self.debug:
print table
print newTable
elif not self.quietMode:
wikipedia.showDiff(table, newTable)
return newTable, warnings, warning_messages
def findTable(self, text):
    """
    Finds the first HTML table (which can contain nested tables) inside a text.

    Returns a tuple (table, start, end) where table is the markup from the
    outermost opening table tag up to and including its matching closing
    tag, and start/end are offsets such that text[start:end] == table.
    Returns (None, 0, 0) if text contains no opening table tag, or if the
    opening tags are not balanced by closing tags.
    """
    # NOTE(review): the tag literals and several statements of this method
    # were missing from the copy under review (the body did not parse).
    # It was reconstructed from the docstring and the surviving fragments;
    # confirm against version control before merging.
    open_tag = "<table"
    close_tag = "</table>"

    start = text.find(open_tag)
    if start == -1:
        return None, 0, 0

    # depth counts the table tags that are currently open; i is the offset
    # from which the next tag is searched.
    depth = 1
    i = start + 1
    while depth > 0:
        next_close = text.find(close_tag, i)
        if next_close == -1:
            print("More opening than closing table tags. Skipping.")
            return None, 0, 0
        next_open = text.find(open_tag, i)
        # if another table tag is opened before one is closed
        if -1 < next_open < next_close:
            i = next_open + 1
            depth += 1
        else:
            # step past the whole closing tag so it is part of the result
            i = next_close + len(close_tag)
            depth -= 1
    end = i
    return text[start:end], start, end
def convertAllHTMLTables(self, text):
    '''
    Replaces every HTML table that findTable locates in text with the
    wiki-syntax version produced by convertTable, one table at a time,
    until findTable reports no further table.

    Returns a tuple of the converted text, the number of converted tables
    and the sum of the warning counts reported by convertTable. The
    collected warning messages are passed to wikipedia.output before
    returning.
    '''
    tableCount = 0
    totalWarnings = 0
    collectedMessages = u''

    table, start, end = self.findTable(text)
    while table:
        tableNumber = tableCount + 1
        print(">> Table %i <<" % tableNumber)
        # convert the current table
        newTable, tableWarnings, tableMessages = self.convertTable(table)
        print("")
        totalWarnings += tableWarnings
        for msg in tableMessages:
            collectedMessages += 'In table %i: %s' % (tableNumber, msg)
        # splice the converted table into the text in place of the original
        text = text[:start] + newTable + text[end:]
        tableCount = tableNumber
        # look for the next HTML table in the updated text
        table, start, end = self.findTable(text)

    wikipedia.output(collectedMessages)
    return text, tableCount, totalWarnings
def treat(self, pl):
    '''
    Loads a page, converts all HTML tables in its text to wiki syntax,
    and saves the converted text.

    pl is the page object to process; this method calls its title(),
    site(), get() and put() methods.

    Returns True if the converted text was passed to pl.put() and that
    call returned, otherwise returns False (page not found, page is a
    redirect, no table was converted, or the user declined the change).
    Exceptions raised by pl.put() are not caught here.
    '''
    wikipedia.output(u'\n>>> %s <<<' % pl.title())
    site = pl.site()
    try:
        text = pl.get()
    except wikipedia.NoPage:
        wikipedia.output(u"ERROR: couldn't find %s" % pl.title())
        return False
    except wikipedia.IsRedirectPage:
        wikipedia.output(u'Skipping redirect %s' % pl.title())
        return False

    newText, convertedTables, warningSum = self.convertAllHTMLTables(text)
    if convertedTables == 0:
        wikipedia.output(u"No changes were necessary.")
        return False

    # Decide whether to save without asking: either the conversion was
    # warning-free and the user only wants to be asked about warnings,
    # or the user configured warnings to be ignored altogether.
    if config.table2wikiAskOnlyWarnings and warningSum == 0:
        doUpload = True
    elif config.table2wikiSkipWarnings:
        doUpload = True
    else:
        print("There were %i replacement(s) that might lead to bad output." % warningSum)
        doUpload = (wikipedia.input(u'Do you want to change the page anyway? [y|N]') == "y")

    if not doUpload:
        return False

    # get edit summary message
    if warningSum == 0:
        summary = wikipedia.translate(site.lang, msg_no_warnings)
    elif warningSum == 1:
        summary = wikipedia.translate(site.lang, msg_one_warning) % warningSum
    else:
        summary = wikipedia.translate(site.lang, msg_multiple_warnings) % warningSum
    wikipedia.setAction(summary)
    pl.put(newText)
    return True
def run(self):
    """Passes each page yielded by self.generator to self.treat, in order."""
    for page in self.generator:
        self.treat(page)
def main():
quietMode = False # use -quiet to get less output
# if the -file argument is used, page titles are stored in this array.
# otherwise it will only contain one page.
articles = []
# if -file is not used, this temporary array is used to read the page title.
page_title = []
debug = False
xmlfilename = None
textfilename = None
for arg in wikipedia.handleArgs():
if arg.startswith('-file:'):
if len(arg) == 5:
textfilename = wikipedia.input(u'Please enter the textfile\'s name:')
else:
textfilename = arg[6:]
gen = pagegenerators.TextfilePageGenerator(textfilename)
elif arg.startswith('-xml'):
if len(arg) == 4:
xmlfilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
else:
xmlfilename = arg[5:]
gen = TableXmlDumpPageGenerator(xmlfilename)
elif arg == '-sql':
query = u"""
SELECT page_namespace, page_title
FROM page JOIN text ON (page_id = old_id)
WHERE old_text LIKE '% | | | | | | | | |