#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
This bot spellchecks Wikipedia pages. It is very simple, only checking
whether a word, stripped to its 'essence', is in the list or not; it does
not do any grammar checking or such. It can be used in four ways:

spellcheck.py Title
    Check a single page; after this the bot will ask whether you want to
    check another page
spellcheck.py -start:Title
    Go through the wiki, starting at title 'Title'.
spellcheck.py -newpages
    Go through the pages on [[Special:Newpages]]
spellcheck.py -longpages
    Go through the pages on [[Special:Longpages]]

For each unknown word, you get a couple of options:
    numbered options: replace by known alternatives
    a: This word is correct; add it to the list of known words
    c: The uncapitalized form of this word is correct; add it
    i: Do not edit this word, but do not add it to the list either
    r: Replace the word, and add the replacement as a known alternative
    s: Replace the word, but do not add the replacement
    *: Edit the page using the gui
    g: Give a list of 'guessed' words, which are similar to the given one
    x: Ignore this word, and do not check the rest of the page

When the bot is ended, it will save the extensions to its word list;
there is one word list for each language.

The bot does not rely on Latin script, but does rely on Latin punctuation.
It is therefore expected to work on, for example, Russian and Korean, but
not on, for example, Japanese.

Command-line options:
-html          change HTML-entities like &uuml; into their respective letters.
               This is done both before and after the normal check.
-rebuild       save the complete wordlist, not just the changes, removing the
               old wordlist.
"""
#
# (C) Andre Engels, 2005
#
# Distributed under the terms of the MIT license.
# __version__ = '$Id: spellcheck.py,v 1.37 2006/03/07 00:23:33 wikipedian Exp $' import re,sys import wikipedia import string,codecs msg={ 'en':u'Bot-aided spell checker', 'es':u'Bot asistido de correción ortográfica', 'fr':u'Correction orthographique par robot', 'ia':u'Correction de orthographia per robot', 'nl':u'Spellingscontrole', 'pt':u'Bot de correção ortográfica', } class SpecialTerm(object): def __init__(self,text): self.style = text def makepath(path): """ creates missing directories for the given path and returns a normalized absolute version of the path. - if the given path already exists in the filesystem the filesystem is not modified. - otherwise makepath creates directories along the given path using the dirname() of the path. You may append a '/' to the path if you want it to be a directory path. from holger@trillke.net 2002/03/18 """ from os import makedirs from os.path import normpath,dirname,exists,abspath dpath = normpath(dirname(path)) if not exists(dpath): makedirs(dpath) return normpath(abspath(path)) def distance(a,b): # Calculates the Levenshtein distance between a and b. # That is, the number of edits needed to change one into # the other, where one edit is the addition, removal or # change of a single character. 
# Copied from Magnus Lie Hetland at http://hetland.org/python/ n, m = len(a), len(b) if n > m: # Make sure n <= m, to use O(min(n,m)) space a,b = b,a n,m = m,n current = range(n+1) for i in range(1,m+1): previous, current = current, [i]+[0]*m for j in range(1,n+1): add, delete = previous[j]+1, current[j-1]+1 change = previous[j-1] if a[j-1] != b[i-1]: change = change + 1 current[j] = min(add, delete, change) return current[n] def getalternatives(string): # Find possible correct words for the incorrect word string basetext = wikipedia.input(u"Give a text that should occur in the words to be checked.\nYou can choose to give no text, but this will make searching slow:") basetext = basetext.lower() simwords = {} for i in xrange(11): simwords[i] = [] for alt in knownwords.keys(): if basetext: if alt.lower().find(basetext) == -1: dothis = False else: dothis = True else: dothis = True if dothis: diff = distance(string,alt) if diff < 11: if knownwords[alt] == alt: simwords[diff] += [alt] else: simwords[diff] += knownwords[alt] posswords = [] for i in xrange(11): posswords += simwords[i] return posswords[:30] def uncap(string): # uncapitalize the first word of the string return string[0].lower()+string[1:] def cap(string): # uncapitalize the first word of the string return string[0].upper()+string[1:] def askAlternative(word,context=None): correct = None wikipedia.output(u"="*60) wikipedia.output(u"Found unknown word '%s'"%word) if context: wikipedia.output(u"Context:") wikipedia.output(u""+context) wikipedia.output(u"-"*60) while not correct: for i in xrange(len(Word(word).getAlternatives())): wikipedia.output(u"%s: Replace by '%s'"%(i+1,Word(word).getAlternatives()[i].replace('_',' '))) wikipedia.output(u"a: Add '%s' as correct"%word) if word[0].isupper(): wikipedia.output(u"c: Add '%s' as correct"%(uncap(word))) wikipedia.output(u"i: Ignore once") wikipedia.output(u"r: Replace text") wikipedia.output(u"s: Replace text, but do not save as alternative") 
wikipedia.output(u"g: Guess (give me a list of similar words)") wikipedia.output(u"*: Edit by hand") wikipedia.output(u"x: Do not check the rest of this page") answer = wikipedia.input(u":") if answer in "aAiI": correct = word if not answer in "iI": knownwords[word] = word newwords.append(word) elif answer in "rRsS": correct = wikipedia.input(u"What should I replace it by?") if answer in "rR": if correct_html_codes: correct = removeHTML(correct) if correct != cap(word) and correct != uncap(word) and correct != word: try: knownwords[word] += [correct.replace(' ','_')] except KeyError: knownwords[word] = [correct.replace(' ','_')] newwords.append(word) knownwords[correct] = correct newwords.append(correct) elif answer in "cC" and word[0].isupper(): correct = word knownwords[uncap(word)] = uncap(word) newwords.append(uncap(word)) elif answer in "gG": possible = getalternatives(word) if possible: print "Found alternatives:" for pos in possible: print " %s"%pos else: print "No similar words found." 
elif answer=="*": correct = edit elif answer=="x": correct = endpage else: for i in xrange(len(Word(word).getAlternatives())): if answer == str(i+1): correct = Word(word).getAlternatives()[i].replace('_',' ') return correct def removeHTML(page): # TODO: Consider removing this; this stuff can be done by cosmetic_changes.py result = page result = result.replace('Ä',u'Ä') result = result.replace('ä',u'ä') result = result.replace('Ë',u'Ë') result = result.replace('ë',u'ë') result = result.replace('Ï',u'Ï') result = result.replace('ï',u'ï') result = result.replace('Ö',u'Ö') result = result.replace('ö',u'ö') result = result.replace('Ü',u'Ü') result = result.replace('ü',u'ü') result = result.replace('Á',u'Á') result = result.replace('á',u'á') result = result.replace('É',u'É') result = result.replace('é',u'é') result = result.replace('Í',u'Í') result = result.replace('í',u'í') result = result.replace('Ó',u'Ó') result = result.replace('ó',u'ó') result = result.replace('Ú',u'Ú') result = result.replace('ú',u'ú') result = result.replace('À',u'À') result = result.replace('à',u'à') result = result.replace('È',u'È') result = result.replace('è',u'è') result = result.replace('Ì',u'Ì') result = result.replace('ì',u'ì') result = result.replace('Ò',u'Ò') result = result.replace('ò',u'ò') result = result.replace('Ù',u'Ù') result = result.replace('ù',u'ù') result = result.replace('Â',u'Â') result = result.replace('â',u'â') result = result.replace('Ê',u'Ê') result = result.replace('ê',u'ê') result = result.replace('Î',u'Î') result = result.replace('î',u'î') result = result.replace('Ô',u'Ô') result = result.replace('ô',u'ô') result = result.replace('Û',u'Û') result = result.replace('û',u'û') result = result.replace('Å',u'Å') result = result.replace('å',u'å') result = result.replace('°',u'°') return result def spellcheck(page): text = page if correct_html_codes: text = removeHTML(text) loc = 0 while True: wordsearch = re.compile(r'([\s\=\<\>\_]*)([^\s\=\<\>\_]+)') match = 
wordsearch.search(text,loc) if not match: # No more words on this page break loc += len(match.group(1)) bigword = Word(match.group(2)) smallword = bigword.derive() if not Word(smallword).isCorrect(): replacement = askAlternative(smallword,context=text[max(0,loc-40):loc+len(match.group(2))+40]) if replacement == edit: import editarticle editor = editarticle.TextEditor() # TODO: Don't know to which index to jump newtxt = editor.edit(text, jumpIndex = 0, highlight = smallword) if newtxt: text = newtxt elif replacement == endpage: loc = len(text) else: replacement = bigword.replace(replacement) text = text[:loc] + replacement + text[loc+len(match.group(2)):] loc += len(replacement) else: loc += len(match.group(2)) if correct_html_codes: text = removeHTML(text) return text class Word(object): def __init__(self,text): self.word = text def derive(self): # Get the short form of the word, without punctuation, square # brackets etcetera shortword = self.word # Remove all words of the form [[something:something - these are # usually interwiki links or category links if shortword.rfind(':') != -1: if -1 < shortword.rfind('[[') < shortword.rfind(':'): shortword = "" # Remove barred links if shortword.rfind('|') != -1: if -1 < shortword.rfind('[[') < shortword.rfind('|'): shortword = shortword[:shortword.rfind('[[')] + shortword[shortword.rfind('|')+1:] else: shortword = shortword[shortword.rfind('|')+1:] shortword = shortword.replace('[','') shortword = shortword.replace(']','') # Remove non-alphanumerical characters at the start try: while shortword[0] in string.punctuation: shortword=shortword[1:] except IndexError: return "" # Remove non-alphanumerical characters at the end; no need for the # try here because if things go wrong here, they should have gone # wrong before while shortword[-1] in string.punctuation: shortword=shortword[:-1] # Do not check URLs if shortword.startswith("http://"): shortword="" # Do not check 'words' with only numerical characters number = True for 
i in xrange(len(shortword)): if not (shortword[i] in string.punctuation or shortword[i] in string.digits): number = False if number: shortword = "" return shortword def replace(self,rep): # Replace the short form by 'rep'. Keeping simple for now - if the # short form is part of the long form, replace it. If it is not, ask # the user if rep == self.derive(): return self.word if self.word.find(self.derive()) == -1: return wikipedia.input(u"Please give the result of replacing %s by %s in %s:"%(self.derive(),rep,self.word)) return self.word.replace(self.derive(),rep) def isCorrect(self): if self.word == "": return True try: if knownwords[self.word] == self.word: return True except KeyError: pass if self.word[0].isupper(): return Word(uncap(self.word)).isCorrect() else: return False def getAlternatives(self): alts = [] if self.word[0].islower(): if Word(cap(self.word)).isCorrect(): alts = [cap(self.word)] try: alts += knownwords[self.word] except KeyError: pass return alts def declare_correct(self): knownwords[self.word] = self.word def declare_alternative(self,alt): if not alt in knownwords[self.word]: knownwords[self.word].append(word) newwords.append(self.word) return self.alternatives try: edit = SpecialTerm("edit") endpage = SpecialTerm("end page") title = [] knownwords = {} newwords = [] start = None newpages = False longpages = False correct_html_codes = False rebuild = False for arg in sys.argv[1:]: arg = wikipedia.argHandler(arg, 'spellcheck') if arg: if arg.startswith("-start:"): start = arg[7:] elif arg.startswith("-newpages"): newpages = True elif arg.startswith("-longpages"): longpages = True elif arg.startswith("-html"): correct_html_codes = True elif arg.startswith("-rebuild"): rebuild = True else: title.append(arg) mysite = wikipedia.getSite() wikipedia.setAction(wikipedia.translate(mysite,msg)) filename = 'spelling/spelling-' + mysite.language() + '.txt' print "Getting wordlist" try: f = codecs.open(makepath(filename), 'r', encoding = mysite.encoding()) 
for line in f.readlines(): # remove trailing newlines and carriage returns try: while line[-1] in ['\n', '\r']: line = line[:-1] except IndexError: pass #skip empty lines if line != '': if line[0] == "1": word = line[2:] knownwords[word] = word else: line = line.split(' ') word = line[1] knownwords[word] = line[2:] f.close() except IOError: print "Warning! There is no wordlist for your language!" else: print "Wordlist successfully loaded." except: wikipedia.stopme() raise try: if newpages: for (page, date, length, loggedIn, user, comment) in wikipedia.getSite().newpages(1000): try: text = page.get() except wikipedia.Error: pass else: text = spellcheck(text) if text != page.get(): page.put(text) elif start: for page in mysite.allpages(start = start): try: text = page.get() except wikipedia.Error: pass else: text = spellcheck(text) if text != page.get(): page.put(text) if longpages: for (page, length) in wikipedia.getSite().longpages(500): try: text = page.get() except wikipedia.Error: pass else: text = spellcheck(text) if text != page.get(): page.put(text) else: title = ' '.join(title) while title != '': try: page = wikipedia.Page(mysite,title) text = page.get() except wikipedia.NoPage: print "Page does not exist." except wikipedia.IsRedirectPage: print "Page is a redirect page" else: text = spellcheck(text) if text != page.get(): page.put(text) title = wikipedia.input(u"Which page to check now? (enter to stop)") finally: wikipedia.stopme() filename = 'spelling/spelling-' + mysite.language() + '.txt' if rebuild: list = knownwords.keys() list.sort() f = codecs.open(makepath(filename), 'w', encoding = mysite.encoding()) else: list = newwords f = codecs.open(makepath(filename), 'a', encoding = mysite.encoding()) for word in list: if Word(word).isCorrect(): f.write("1 %s\n"%word) else: f.write("0 %s %s\n"%(word," ".join(knownwords[word]))) f.close()