#---------- find_urls.py----------#
# Functions to identify and extract URLs and email addresses

import fileinput
import re

# Matches http/ftp/gopher URLs embedded in running text.  Group 1 is the
# whole URL.  The trailing lookahead means a URL is only recognised when it
# is followed by whitespace or clause-ending punctuation, so a URL that is
# the very last character sequence of the input (no newline) is not matched.
#
# Flags are passed to re.compile() instead of being written inline as
# "(?x)": inline global flags that are not at the very start of the pattern
# are rejected by newer versions of the re module.
pat_url = re.compile(r'''
    (                     # group 1: the whole URL
      (http|ftp|gopher)   # make sure we find a resource type
      ://                 # ...needs to be followed by colon-slash-slash
      (\w+[:.]?){2,}      # at least two domain groups, e.g. (gnosis.)(cx)
      (/?|                # could be just the domain name (maybe w/ slash)
        [^ \n\r"]+        # or anything up to a space, newline, CR or quote
        [\w/])            # resource name ends in alphanumeric or slash
      (?=[\s\.,>)'"\]])   # assert: followed by white or clause ending
    )                     # end of match group
    ''', re.VERBOSE)

# Matches the first email address on a line, skipping lines that start with
# a Message-ID: or In-Reply-To header (whose values look like addresses).
# Group 2 is the address; extract_email() relies on that group index.
#
# NOTE(review): the middle of this pattern was missing from the file as
# received (everything between a "<" and the next ">" had been dropped,
# which looks like HTML tag stripping).  The lines from the negative
# lookbehind through the domain groups are a reconstruction -- confirm
# against the original listing before relying on exact matching behaviour.
pat_email = re.compile(r'''
    (?=^.{11}             # Mail header matcher: peek at first 11 chars
      (?<!Message-ID:|    # rule out Message-ID's as best possible
          In-Reply-To))   # ...and also In-Reply-To (both 11 chars wide)
    (.*?)                 # group 1: lazily skip up to the address
    (                     # group 2: the email address itself
      ([A-Za-z0-9-]+\.)?  # maybe an initial part: DAVID.mertz@gnosis.cx
      [A-Za-z0-9-]+       # definitely some local user: MERTZ@gnosis.cx
      @                   # ...needs an at sign in the middle
      (\w+\.?){2,}        # at least two domain groups, e.g. (gnosis.)(cx)
      (?=[\s\.,>)'"\]])   # assert: followed by white or clause ending
    )                     # end of match group
    ''', re.VERBOSE | re.MULTILINE)


def extract_urls(s):
    """Return a list of the URLs found in the string `s`, in order.

    Only http, ftp and gopher URLs are recognised, and each must be
    followed by whitespace or clause-ending punctuation (see pat_url).
    Returns an empty list when nothing matches.
    """
    # findall() yields one tuple per match; index 0 is the outer group.
    return [groups[0] for groups in pat_url.findall(s)]


def extract_email(s):
    """Return a list of email addresses found in the string `s`.

    The pattern is anchored to the start of a line, so at most one
    address is reported per line of `s`.  Returns an empty list when
    nothing matches.
    """
    # Index 1 is the address group; index 0 is the skipped line prefix.
    return [groups[1] for groups in pat_email.findall(s)]


def main():
    """Scan the files named on the command line (or stdin) and print hits.

    URLs are printed as "<filename> => <url>" and email addresses as
    "<filename> -> <address>", one per line.
    """
    for line in fileinput.input():
        # Single-argument print() with %-formatting emits the same text
        # under both the Python 2 print statement and the Python 3 function.
        for url in extract_urls(line):
            print('%s => %s' % (fileinput.filename(), url))
        for email in extract_email(line):
            print('%s -> %s' % (fileinput.filename(), email))


if __name__ == '__main__':
    main()