#---------- find_urls.py----------#
# Functions to identify and extract URLs and email addresses

import re, fileinput

# Compiled pattern that finds URLs embedded in running text.
#
# Each ``findall`` result is a tuple of groups: element 0 is the complete
# URL, followed by the scheme, the last repetition of the domain group and
# the trailing path part.  A URL is only recognised when it is followed by
# whitespace or clause-ending punctuation (see the final lookahead), so a
# URL that is the very last thing in the string is not matched.
#
# The VERBOSE flag is passed to re.compile() instead of being written
# inline as ``(?x)``: the pattern text starts with whitespace, and a global
# inline flag that is not at the very start of the expression is rejected
# by Python 3.11+ (and deprecated since 3.6).
#
# NOTE(review): the nested quantifier in the domain group may backtrack
# heavily on long host names with no terminator after them -- confirm that
# inputs are reasonably bounded.
pat_url = re.compile(r'''
                     ( # identify URLs within text
     (http|ftp|gopher) # make sure we find a resource type
                   :// # ...needs to be followed by colon-slash-slash
        (\w+[:.]?){2,} # at least two domain groups, e.g. (gnosis.)(cx)
                  (/?| # could be just the domain name (maybe w/ slash)
            [^ \n\r"]+ # or stuff up to a space, newline, CR or quote
                [\w/]) # resource name ends in alphanumeric or slash
     (?=[\s\.,>)'"\]]) # assert: followed by white or clause ending
                     ) # end of match group
                       ''', re.VERBOSE)
# Compiled pattern that finds an email address on a line of text.
#
# Each ``findall`` result is a tuple of groups: element 0 is whatever
# precedes the address on the line, element 1 is the complete address.
# The leading lookahead anchors the match at the start of a line, requires
# that line to be at least 11 characters long, and rejects lines whose
# first 11 characters are "Message-ID:" or "In-Reply-To" (the ids in those
# mail headers look like addresses but are not).
#
# VERBOSE and MULTILINE are passed to re.compile() instead of being written
# inline as ``(?xm)``: the pattern text starts with whitespace, and a global
# inline flag that is not at the very start of the expression is rejected
# by Python 3.11+ (and deprecated since 3.6).
pat_email = re.compile(r'''
                       # identify email addresses in text, line by line
             (?=^.{11} # Mail header matcher
     (?<!Message-ID:|  # rule out Message-ID's as best possible
         In-Reply-To)) # ...and also In-Reply-To
                (.*?)( # must grab to email to allow prior lookbehind
    ([A-Za-z0-9-]+\.)? # maybe an initial part: DAVID.mertz@gnosis.cx
         [A-Za-z0-9-]+ # definitely some local user: MERTZ@gnosis.cx
                     @ # ...needs an at sign in the middle
          (\w+\.?){2,} # at least two domain groups, e.g. (gnosis.)(cx)
     (?=[\s\.,>)'"\]]) # assert: followed by white or clause ending
                     ) # end of match group
                       ''', re.VERBOSE | re.MULTILINE)
def extract_urls(s):
    """Return a list of the URLs that ``pat_url`` finds in the string *s*.

    ``findall`` yields one tuple of groups per match; element 0 is the
    outermost group, i.e. the complete URL text.  An empty list is
    returned when nothing matches.
    """
    return [groups[0] for groups in pat_url.findall(s)]
def extract_email(s):
    """Return a list of the email addresses ``pat_email`` finds in *s*.

    ``findall`` yields one tuple of groups per match; element 1 is the
    group holding the complete address (element 0 is the text that
    precedes it on the line).  An empty list is returned when nothing
    matches.
    """
    return [groups[1] for groups in pat_email.findall(s)]

if __name__ == '__main__':
    # Command-line use: read every line supplied by fileinput (the files
    # named in sys.argv, or standard input when none are given) and report
    # each URL as "<file> => <url>" and each address as "<file> -> <addr>".
    for line in fileinput.input():
        source = fileinput.filename()
        # A single pre-formatted string inside print(...) produces the same
        # output under both the Python 2 print statement and the Python 3
        # print function; the old "print a, b, c" form is a SyntaxError on
        # Python 3.
        for url in extract_urls(line):
            print('%s => %s' % (source, url))
        for email in extract_email(line):
            print('%s -> %s' % (source, email))