# -*- coding: iso8859-1 -*- """ Bot for getting multiple images from an external site. It takes a URL as an argument and finds all images (and other files specified by the extensions in 'fileformats') that URL is referring to, asking whether to upload them. If further arguments are given, they are considered to be the text that is common to the descriptions. A second use is to get a number of images that have URLs only differing in numbers. To do this, use the command line option "-pattern", and give the URL with the variable part replaced by '$' (if that character occurs in the URL itself, you will have to change the bot code, my apologies). Other options: -shown Choose images shown on the page as well as linked from it -justshown Choose _only_ images shown on the page, not those linked """ __version__='$Id: imageharvest.py,v 1.11 2006/01/26 19:06:21 leogregianin Exp $' import re, sys, os import wikipedia, upload def get_imagelinks(url): # Given a URL, get all images linked to by the page at that URL. # First, we get the location for relative links from the URL. relativepath = url.split("/") if len(relativepath) == 1: relativepath=relativepath[0] else: relativepath=relativepath[:len(relativepath)-1] relativepath="/".join(relativepath) links = [] uo = wikipedia.MyURLopener() file = uo.open(url) text = file.read() file.close() text = text.lower() if not shown: R=re.compile("href\s*=\s*[\"'](.*?)[\"']") elif shown == "just": R=re.compile("src\s*=s*[\"'](.*?)[\"']") else: R=re.compile("[\"'](.*?)[\"']") for link in R.findall(text): ext = os.path.splitext(link)[1].lower().strip('.') if ext in fileformats: if re.compile("://").match(text): links += [link] else: links += [relativepath+"/"+link] return links def main(give_url,image_url): url = give_url basicdesc = [] if url == '': if image_url: url = wikipedia.input(u"What URL range should I check (use $ for the part that is changeable)") else: url = wikipedia.input(u"From what URL should I get the images?") if image_url: minimum=1 maximum=99 answer= wikipedia.input(u"What is the first number to check (default: 1)") if answer: minimum=int(answer) answer= wikipedia.input(u"What is the last number to check (default: 99)") if answer: maximum=int(answer) if basicdesc == []: basicdesc = wikipedia.input( u"What text should be added at the end of the description of each image from this url?") else: basicdesc = ' '.join(desc) if image_url: ilinks = [] i = minimum while i <= maximum: ilinks += [url.replace("$",str(i))] i += 1 else: ilinks = get_imagelinks(url) for image in ilinks: answer = wikipedia.inputChoice(u'Include image %s?' % image, ['yes', 'no', 'stop'], ['y', 'N', 's'], 'N') if answer in ["y","Y"]: desc = wikipedia.input(u"Give the description of this image:") desc = desc + "\r\n\n\r" + basicdesc uploadBot = upload.UploadRobot(image, desc) uploadBot.run() elif answer in ["s","S"]: break try: url = u'' image_url = False shown = False for arg in sys.argv[1:]: arg = wikipedia.argHandler(arg, 'imageharvest') if arg: if arg == "-pattern": image_url = True elif arg == "-shown": shown = True elif arg == "-justshown": shown = "just" elif url == u'': url = arg else: desc += [arg] fileformats = ('jpg', 'jpeg', 'png', 'gif', 'svg', 'ogg') mysite = wikipedia.getSite() main(url,image_url) finally: wikipedia.stopme()