# -*- coding: utf-8  -*-
"""
This bot is used for checking external links from Wikipedia. It checks
all external links in groups of 480 pages, gives the error code for each
that causes problems, and counts the number of links with and without
problems.

It accepts all general Wikipediabot arguments as well as:

-start:xxx  Check starting at 'xxx'.
-nolog      Do not log to a file, only give output to the screen.

Anything else is assumed to be a page that is to be checked. Spaces in
page titles have to be replaced by underscores, otherwise the bot assumes
the parts are separate pages.

If no page has been specified and also no -start argument has been
provided, the bot acts as if -start:! had been specified, starting at the
beginning.

The bot returns all links that have some problem, with the error code
provided by the server, the artificial error code -1 if the server could
not be reached at all, or the artificial error code -2 if no host could be
found in the link.

Output is sent both to the screen and to the file check_extern.log
"""
#
# (C) Andre Engels, 2004
#
# Distributed under the terms of the MIT license.
#
__version__='$Id: check_extern.py,v 1.1 2006/01/16 20:49:19 wikipedian Exp $'

import wikipedia, urllib, re, sys, httplib


class URLerrorFinder(urllib.FancyURLopener):
    """URL opener whose HTTP handler returns the status code of the reply.

    Instead of a file-like object for the fetched document, open_http()
    returns the numeric status code reported by the server (or -2 when no
    host could be determined), so callers can classify a link as working
    or broken.
    """

    # Sent as the User-Agent of every request made through this opener.
    version="RobHooftWikiRobot/1.0"

    def open_http(self, url):
        """Issue a GET request for url and return the numeric status code.

        url is either a string of the form '//host/path' or a
        (proxyhost, full_url) pair describing a request that has to go
        through a proxy.

        Returns the status code given by httplib's getreply(), or -2 if
        no host to connect to could be found.
        """
        if isinstance(url, str):
            # Direct request: the string carries both host and selector.
            host, selector = urllib.splithost(url)
            if host:
                # Credentials embedded in the host part are discarded.
                _, host = urllib.splituser(host)
                host = urllib.unquote(host)
            realhost = host
        else:
            # Proxied request: connect to the proxy and pass the full URL
            # as the selector.
            host, selector = url
            urltype, rest = urllib.splittype(selector)
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                # These helpers live in urllib; the module is imported as
                # a whole, so they have to be qualified.
                realhost, rest = urllib.splithost(rest)
                if realhost:
                    user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the user:password part.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if urllib.proxy_bypass(realhost):
                    # This host is exempt from proxying; contact it directly.
                    host = realhost
        if not host:
            return -2
        h = httplib.HTTP(host)
        try:
            h.putrequest('GET', selector)
            if realhost:
                h.putheader('Host', realhost)
            for args in self.addheaders:
                h.putheader(*args)
            h.endheaders()
            # getreply() yields (status code, message, headers); only the
            # status code is of interest here.
            errcode = h.getreply()[0]
        finally:
            # Release the connection even if the request failed midway.
            h.close()
        return errcode

# Which error codes do we not consider errors?
allowederrorcodes = [100, 101, 200, 201, 202, 203, 205, 304]

# Descriptions of the codes that can be reported for a link. Negative codes
# are artificial ones produced by this bot itself; the entries above 600
# are not HTTP status codes (presumably client-side error numbers that some
# environments report -- TODO confirm their origin).
errname = {
    -1: 'No contact to server',
    -2: 'No host found',
    100: 'Continue',
    101: 'Switching Protocols',
    200: 'OK',
    201: 'Created',
    202: 'Accepted',
    203: 'Non-Authoritative Information',
    204: 'No Content',
    205: 'Reset Content',
    206: 'Partial Content',
    300: 'Multiple Choices',
    301: 'Moved Permanently',
    302: 'Moved Temporarily',
    303: 'See Other',
    304: 'Not Modified',
    305: 'Use Proxy',
    307: 'Temporary Redirect',
    400: 'Bad Request',
    401: 'Unauthorized',
    402: 'Payment Required',
    403: 'Forbidden',
    404: 'Not Found',
    405: 'Method Not Allowed',
    406: 'Not Acceptable',
    407: 'Proxy Authentication Required',
    408: 'Request Timeout',
    409: 'Conflict',
    410: 'Gone',
    411: 'Length Required',
    412: 'Precondition Failed',
    413: 'Request Entity Too Large',
    414: 'Request-URI Too Large',
    415: 'Unsupported Media Type',
    416: 'Requested Range not satisfiable',
    417: 'Expectation Failed',
    500: 'Internal Server Error',
    501: 'Not Implemented',
    502: 'Bad Gateway',
    503: 'Service Unavailable',
    504: 'Gateway Timeout',
    505: 'HTTP Version not supported',
    8181: 'Certificate Expired',
    12002: 'Timeout',
    12007: 'No such host',
    12029: 'No connection',
    12031: 'Connection Reset',
}


def errorname(error):
    """Return a human-readable description of a numerical HTTP error code.

    Codes listed in errname give their registered description. Any other
    code strictly between 300 and 400 is reported as an unknown
    redirection; everything else is reported as an unknown error.
    """
    if error in errname:
        return errname[error]
    if 300 < error < 400:
        return 'Unknown Redirection Response'
    return 'Unknown Error'


# Title at which a run over all pages begins unless -start: overrides it.
start = '!'
log = True
todo = []
do_all = False

# Sizes that drive the main loop.
BATCH_SIZE = 60         # pages handled per round
FETCH_LIMIT = 480       # pages taken from allpages() per refill of todo
REPORT_INTERVAL = 500   # pages between two summaries

# Pattern for external links in the page text; compiled once for all pages.
link_regex = re.compile(r'http://[^\s}<\]]+[^\s.,:;)\?!\]}<]')

for arg in sys.argv[1:]:
    # argHandler returns a false value for arguments that need no further
    # treatment here (presumably the general bot arguments it consumed).
    arg = wikipedia.argHandler(arg, 'check_extern')
    if not arg:
        continue
    if arg.startswith('-start:'):
        start = arg[7:]
        do_all = True
    elif arg == '-nolog':
        log = False
    else:
        # Anything else is the title of a page to be checked.
        mysite = wikipedia.getSite()
        todo.append(wikipedia.Page(mysite, arg))

# Make sure we have the final site
mysite = wikipedia.getSite()

if not todo:
    # No pages have been given; if also no start is given, we start at
    # the beginning
    do_all = True

if log:
    import logger
    # From here on everything printed also goes to the log file.
    sys.stdout = logger.Logger(sys.stdout, filename = 'check_extern.log')

cont = True
checked = 0
working = 0
nonworking = 0
totalchecked = 0

# NOTE: a bare 'print' and a single parenthesised argument behave exactly
# like the original Python 2 print statements.
try:
    while cont:
        print
        if do_all and len(todo) <= BATCH_SIZE:
            # Refill the work list from the list of all pages.
            fetched = 0
            for pl in wikipedia.allpages(start = start):
                todo.append(pl)
                fetched += 1
                if fetched == FETCH_LIMIT:
                    break
            if fetched:
                # Continue the next refill after the last page seen; the
                # '_0' suffix presumably yields a title sorting just
                # behind it. Skipped when nothing was fetched, so an
                # exhausted listing no longer raises IndexError.
                start = todo[-1].title() + '_0'
        # todo is a list of pages to do, donow are the pages we will be
        # doing in this run.
        if len(todo) > BATCH_SIZE:
            donow = todo[:BATCH_SIZE]
            todo = todo[BATCH_SIZE:]
        else:
            donow = todo
            # If there was more to do, the refill above would have extended
            # todo beyond this size.
            cont = False
        if donow:
            try:
                wikipedia.getall(mysite, donow)
            except wikipedia.SaxError:
                # Ignore this error, and get the pages the traditional way.
                pass
        checked += len(donow)
        for pl in donow:
            try:
                for url in link_regex.findall(pl.get()):
                    url = wikipedia.unicode2html(url, 'ascii')
                    try:
                        error = URLerrorFinder().open(url)
                    except IOError:
                        # The server could not be reached at all.
                        error = -1
                    if error in allowederrorcodes:
                        working += 1
                    else:
                        nonworking += 1
                        print
                        wikipedia.output(u'Page "%s" links to:' % pl.title())
                        wikipedia.output(url)
                        wikipedia.output(u'Which gave error: %s %s'
                                         % (error, errorname(error)))
            # If anything is wrong with the Wikipedia page, just ignore
            except (wikipedia.NoPage, wikipedia.IsRedirectPage,
                    wikipedia.LockedPage):
                pass
        if checked >= REPORT_INTERVAL or not cont:
            # Move one interval from the running counter to the total. On
            # the final round 'checked' may become negative; only the sum
            # of both counters is ever reported, and that stays correct.
            totalchecked += REPORT_INTERVAL
            checked -= REPORT_INTERVAL
            print
            print('======================================================================')
            if donow:
                # Use the title, as the per-link output does, rather than
                # formatting the Page object itself inside the brackets.
                wikipedia.output(u'%s pages checked, last was [[%s]]'
                                 % (totalchecked + checked, donow[-1].title()))
            else:
                wikipedia.output(u'%s pages checked' % (totalchecked + checked))
            print('In those pages there were %s correct and %s problematic external links.'
                  % (working, nonworking))
finally:
    # Always sign off from the framework, whether the run ended normally
    # or with an exception (which still propagates).
    wikipedia.stopme()