# -*- coding: utf-8 -*-
"""
Library to work with category pages on Wikipedia
"""
#
# (C) Rob W.W. Hooft, Andre Engels, 2004
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: catlib.py,v 1.63 2006/03/10 11:56:03 a_engels Exp $'
#
import re
import time

import wikipedia

# Edit-summary templates used when a category is recreated under a new name,
# keyed by language code.  Both placeholders are filled in by the caller
# (per the wording: the old title and the list of authors).
msg_created_for_renaming = {
    'de':u'Bot: Verschoben von %s. Autoren: %s',
    'en':u'Robot: Moved from %s. Authors: %s',
    'ia':u'Robot: Transferite de %s. Autores: %s',
    'fr':u'Robot : déplacé depuis %s. Auteurs: %s',
    }


def isCatTitle(title, site):
    """Tell whether title starts with one of site's category namespace names.

    The text in front of the first colon is looked up in
    site.category_namespaces().  A title without any colon is never a
    category title.
    """
    if ':' not in title:
        return False
    prefix = title[:title.index(':')]
    return prefix in site.category_namespaces()


def unique(l):
    """Given a list of hashable objects, return an alphabetized unique list.

    The argument itself is left untouched; a new list is returned.
    """
    # Copy the de-duplicated keys into a real list before sorting: calling
    # .sort() directly on dict.keys() only works where keys() returns a list.
    result = list(dict.fromkeys(l))
    result.sort()
    return result


class Category(wikipedia.Page):
    """Subclass of Page that has some special tricks that only work for
    category: pages"""

    def __init__(self, site, title = None, insite = None, tosite = None,
                 sortKey = None):
        """Initialise the underlying Page and remember the sort key.

        site, title, insite and tosite are handed on unchanged to
        wikipedia.Page.__init__.  sortKey, when given, is appended to the
        title by aslink().

        Raises ValueError if the resulting page is not in namespace 14.
        """
        wikipedia.Page.__init__(self, site = site, title = title,
                                insite = insite, tosite = tosite)
        self.sortKey = sortKey
        if self.namespace() != 14:
            raise ValueError(u'BUG: %s is not in the category namespace!' % title)

    def aslink(self):
        """
        A string representation in the form of a link. This method is
        different from Page.aslink() as the link will never have the form of
        an interwiki link.

        If a sort key is set, it is appended after a pipe: [[Title|sortKey]].
        """
        if self.sortKey:
            titleWithSortKey = '%s|%s' % (self.title(), self.sortKey)
        else:
            titleWithSortKey = self.title()
        return '[[%s]]' % titleWithSortKey

    def catlist(self, recurse = False, purge = False):
        """Cache result of _make_catlist for a second call

        The recursive and the non-recursive listing are cached separately
        (in _catlistT and _catlistF respectively), so that a listing fetched
        in one mode is never handed out for a request in the other mode.
        With purge set, the listing is always fetched again.

        This should not be used outside of this module.
        """
        if recurse:
            attr, other = '_catlistT', '_catlistF'
        else:
            attr, other = '_catlistF', '_catlistT'
        if purge:
            # Fresh data was requested, so whatever is cached for the other
            # mode is outdated as well; drop it so it gets refetched on demand.
            if hasattr(self, other):
                delattr(self, other)
            setattr(self, attr,
                    self._make_catlist(recurse = recurse, purge = True))
        # if we don't have a cached version for this mode
        elif not hasattr(self, attr):
            setattr(self, attr, self._make_catlist(recurse = recurse))
        return getattr(self, attr)

    def _make_catlist(self, recurse = False, purge = False, site = None):
        """Make a list of all articles and categories that are in this
        category. If recurse is set to True, articles and subcategories of
        any subcategories are also retrieved.

        Returns non-unique, non-sorted lists of articles, subcategories and
        supercategories. The supercategory list only contains the
        supercategories of this category, regardless of the recurse argument.

        This should not be used outside of this module.
        """
        if site is None:
            site = self.site()
        # NOTE(review): this is a plain string comparison, so a version such
        # as "1.10" would sort before "1.4" -- confirm what site.version()
        # returns before relying on it for newer MediaWiki releases.
        if site.version() < "1.4":
            Rtitle = re.compile(r'title\s?=\s?"([^"]*)"')
        else:
            Rtitle = re.compile(r'/wiki/\S* title\s?=\s?"([^"]*)"')
        ns = site.category_namespaces()
        catsdone = []
        catstodo = [self]
        articles = []
        subcats = []
        supercats=[]
        # regular expression matching the "(next 200)" link
        RLinkToNextPage = re.compile('&from=(.*?)" title="')
        while catstodo:
            cat = catstodo.pop()
            catsdone.append(cat)
            # if category list is split up into several pages, this variable
            # stores where the next list page should start
            startFromPage = None
            thisCatDone = False
            # This loop will run until all list pages of the current category
            # have been read. Note: supercategories are displayed equally on
            # each of the list pages, so we will care about them after this
            # loop.
            while not thisCatDone:
                path = site.get_address(cat.urlname())
                if startFromPage:
                    path += '&from=' + startFromPage
                if purge:
                    path += '&action=purge'
                if startFromPage:
                    wikipedia.output('Getting [[%s]] starting at %s...'
                                     % (cat.title(), startFromPage))
                else:
                    wikipedia.output('Getting [[%s]]...' % cat.title())
                txt = site.getUrl(path)
                # save a copy of this text to find out self's supercategory.
# if recurse is true, this function should only return self's # supercategory, not the ones of its subcats. self_txt = txt # index where subcategory listing begins # this only works for the current version of the MonoBook skin ibegin = txt.index('"clear:both;"') # index where article listing ends try: iend = txt.index('