# -*- coding: utf-8 -*- """ (C) 2004 Thomas R. Koll, Distributed under the terms of the MIT license. This bot downloads the HTML-pages of articles and images and saves the interesting parts, i.e. the article-text and the footer to a file like Hauptseite.txt. TODO: change the paths in the HTML-file Options: -o: Specifies the output-directory where to save the files -images: Download all images -overwrite:[I|A|B] Ignore existing Images|Article|Both and download them even if they exist Features, not bugs: * Won't d/l images of an article if you set -overwrite:A """ __version__ = '$Id: saveHTML.py,v 1.11 2005/12/21 17:51:26 wikipedian Exp $' import wikipedia,httplib,StringIO,re,sys,md5,os, string from htmlentitydefs import * def extractArticle(data): """ takes a string with the complete HTML-file and returns the article which is contained in
and the pagestats which contain information on last change """ images = [] s = StringIO.StringIO(data) rPagestats = re.compile('.*(\.*\<\/span\>).*') rBody = re.compile('.*
.*') rFooter = re.compile('.*
.*') rDivOpen = re.compile('.*
.*') divLevel = 1 divLast = -1 inArticle = 0 inFooter = 0 result = {'article':"", 'footer':""} for line in s: if line == "


": continue line = line.replace(" ", " ") line = line.replace(" ", " ") if rDivOpen.match(line): divLevel = divLevel + 1 if rBody.match(line): inArticle = 1 divLast = divLevel-2 elif rFooter.match(line): divLast = divLevel-1 inFooter = 1 if inArticle: result['article'] += line elif inFooter: result['footer'] += line if rDivClose.match(line): divLevel = divLevel - 1 if divLevel == divLast: inArticle = 0 inFooter = 0 divLast = -1 return result def html2txt(str): dict = {"%C3%A4": "ä", "%C3%B6": "ö", "%C3%BC": "ü", "%C3%84": "Ä", "%C3%96": "Ö", "%C3%9C": "Ü", "%C3%9F": "ß", "%27": "'", "%28": "(", "%29": ")", "%2C": "," } for entry in dict: str = re.sub(entry, dict[entry], str) return str def extractImages(data): """ takes a string with the complete HTML-file and returns the article which is contained in
and the pagestats which contain information on last change """ images = [] rImage = re.compile('