""" Transform XML Documents to Python objects

Note 0:

    See http://gnosis.cx/publish/programming/xml_matters_2.txt
    for a detailed discussion of this module.

Note 1:

    The XML-SIG distribution is changed fairly frequently while
    it is in beta versions.  The changes in turn are extremely
    likely to affect the functioning of [xml_objectify].

    This version of [xml_objectify] is believed to work with
    Python 2.0.  If fortune smiles upon us, it may also well work
    with Python 2.1+ and/or recent PyXML distributions.

    Should you have earlier PyXML distributions installed, one of
    the earlier [xml_objectify] versions might work better for
    you (possibly without other newer enhancements, however).
    Those can be found at

      http://gnosis.cx/download/xml_objectify-?.??.py

    (where the question marks are version numbers).

Note 2:

    This module is a companion to the [xml_pickle] module.
    However, the focus of each is different.  [xml_pickle] starts
    with a generic Python object, and produces a specialized XML
    document (and reads back from that custom DTD).
    [xml_objectify] starts with a generic XML document, and
    produces a somewhat specialized Python object.  Depending on
    the original and natural form of your data, one companion
    module is preferable to the other.


Usage:

    # Create a "factory object"
    xml_object = XML_Objectify('test.xml')
    # Create two different objects with recursively equal values
    py_obj1 = xml_object.make_instance()
    py_obj2 = xml_object.make_instance()

Classes:

    XML_Objectify
    _XO_
    ExpatFactory


Functions:

    keep_containers(yes_no)
    pyobj_from_dom(dom_node)
    safe_eval()
    pyobj_printer(py_obj)

"""

# Module metadata: revision string, author list, and contributor credits.
__version__ = "$Revision: 0.51 $"
__author__=["David Mertz (mertz@gnosis.cx)",]
# Contributors; their changes are described in __history__.
__thanks_to__=["Grant Munsey (gmunsey@Adobe.COM)",
               "Costas Malamas (costas@malamas.com)",
               "Kapil Thangavelu (kvthan@wm.edu)",
               "Mario Ruggier (Mario.Ruggier@softplumbers.com)",]
__copyright__="""
    This file is released to the public domain.  I (dqm) would
    appreciate it if you choose to keep derived works under terms
    that promote freedom, but obviously am giving up any rights
    to compel such.
"""

__history__="""
    0.1    Initial version

    0.11   Minor tweaks, and improvements to pyobj_printer().
           Added 'keep_containers()' function.

    0.2    Grant Munsey pointed out my gaffe in allowing ad-hoc
           contained instances (subtags) to collide with Python
           names already in use.  Fixed by name-mangling ad-hoc
           classes to form "_XO_klass" corresponding with tag
           <klass>.  Attributes still use actual tag name, e.g.,
               >>> py_obj.klass
               <xml_objectify._XO_klass instance at 165a50>

    0.21   Costas Malamas pointed out that creating a template
           class does not actually *work* to create class
           behaviors.  It is necessary to get this class into the
           xml_objectify namespace.  Generally, this will involve
           an assignment similar to:
               xml_objectify._XO_Eggs = otherscope.Eggs
           A simple example can be found at:
               http://gnosis.cx/download/xo_test.py

    0.30   Costas Malamas proposed the useful improvement of
           defining __getitem__ behavior for dynamically created
           child instances.  As a result, you can use constructs
           like:
               for myegg in spam.egg:
                   print pyobj_printer(myegg)
           without needing to worry whether spam.egg is a list of
           instances or a single instance.

    0.40   Altered by Kapil Thangavelu k_vertigo@yahoo.com to work
           with the latest version of PyXML 0.61.  Mainly syntax
           changes to reflect PyXML's move to 4DOM.

    0.45   Mario Ruggier goaded me to make xml_objectify compatible
           with Python 2.0 (his intent is presumably described
           differently :-) ).  Always optimistic, I (dqm) hope this
           will continue working with later PyXML and Python
           versions.

    0.50   Costas Malamas provided a far faster expat-based parser
           to replace the DOM-based 'pyobj_from_dom()' technique
           (orders of magnitude, with a better complexity order).
           However, when using 'ExpatFactory' to produce a
           'py_obj', there no longer remains a 'xml_obj._dom'
           attribute to refer to for element-sequence or other
           DOM information.  As well, 'ExpatFactory' does not
           collect the 'py_obj._XML' attribute that character-
           oriented markup might want preserved.

           Use of the new parser simply requires an extra (named)
           argument at 'XML_Objectify' initialization, e.g.:
               xml_obj = XML_Objectify('spam.xml',EXPAT)   # or
               xml_obj = XML_Objectify('spam.xml',DOM)     # or
               xml_obj = XML_Objectify('spam.xml',parser=EXPAT)
           Conceivably, other parsers could be added in the
           future (but probably not).  The default option is
           the backward-compatible 'DOM'.

    0.51   Minor cleanup of 0.50 changes.  Also, gave
           'keep_containers()' three states, rather than just
           two:
               NEVER:  do not store the _XML attribute
               MAYBE:  store _XML if there is char-level markup
               ALWAYS: keep _XML attribute for every element

"""

from types import *
from cStringIO import StringIO
import copy, string

#-- Node types are now class constants defined in class Node.
from xml.dom.minidom import Node
from xml.dom import minidom
DOM = 'DOM'

#-- Support expat parsing for ExpatFactory (if possible)
try:
    # EXPAT doubles as the parser-selection constant and as the flag
    # telling XML_Objectify whether the expat parser can be used at all.
    import xml.parsers.expat
    EXPAT = 'EXPAT'
except ImportError:
    # Only a missing expat module is tolerated; any other failure while
    # importing is a real error and is allowed to propagate.
    EXPAT = None

#-- Global option: when should an element keep its literal XML content?
#   (pyobj_from_dom() compares KEEP_CONTAINERS against these states)
NEVER = -1      # do not store the _XML attribute
MAYBE = 0       # store _XML only if there is char-level markup
ALWAYS = 1      # keep the _XML attribute for every element
KEEP_CONTAINERS = MAYBE

def keep_containers(val):
    """Set the module-wide KEEP_CONTAINERS option to 'val'.

    'val' is expected to be one of ALWAYS, MAYBE or NEVER; it is stored
    as given, without validation.  Returns None.
    """
    globals()['KEEP_CONTAINERS'] = val

#-- Base class for objectified XML nodes
class _XO_:
    def __getitem__(self, key):
        if not key:
            return self
        else:
            raise IndexError

#-- Class interface to module functionality
class XML_Objectify:
    """Factory object class for 'objectify XML document'

    An instance wraps one XML source; make_instance() hands out the
    Python object built from it.

    Arguments:
        file    -- a filename (byte or unicode string), or an object
                   with a read() method such as an open file
        parser  -- DOM (the default) or EXPAT

    Raises ValueError for an unusable 'file' or an unrecognized
    'parser', and ImportError when 'parser' equals EXPAT but EXPAT is
    unset.

    In DOM mode the instance keeps the parsed document in '_dom', the
    top-level processing instructions in '_processing_instruction', the
    root tag name in '_root' and the converted tree in '_PyObject'.
    """
    def __init__(self, file=None, parser=DOM):
        self._parser = parser

        # Classify the source before touching it, so that nothing gets
        # opened on behalf of a call that is going to be rejected anyway
        if type(file) in (StringType, UnicodeType):
            is_filename = 1
        elif hasattr(file, 'read'):
            is_filename = 0
        else:
            raise ValueError(
                "XML_Objectify must be initialized with filename or file handle")

        if self._parser == EXPAT:
            if not EXPAT:
                raise ImportError("Expat parser not available")
        elif self._parser != DOM:
            raise ValueError(
                "An invalid parser was specified: %s" % self._parser)

        if is_filename:
            self._fh = open(file)
        else:
            self._fh = file

        # First parsing option:  EXPAT (stream based)
        if self._parser == EXPAT:
            # NOTE: rebinding __bases__ changes the class itself, not just
            # this instance; it is what makes ParseFile() and the expat
            # handler methods reachable through 'self'.
            self.__class__.__bases__ = (ExpatFactory,)
            ExpatFactory.__init__(self)
            # The handle stays open: it is only read in make_instance()

        # Second parsing option: DOM (keeps _dom)
        else:
            try:
                xml_text = self._fh.read()
            finally:
                # Close only a handle opened here; a caller-supplied one
                # remains the caller's responsibility
                if is_filename:
                    self._fh.close()
            self._dom = minidom.parseString(xml_text)
            self._processing_instruction = {}

            for child in self._dom.childNodes:
                if child.nodeType == Node.PROCESSING_INSTRUCTION_NODE:
                    self._processing_instruction[child.nodeName] = child.nodeValue
                elif child.nodeType == Node.ELEMENT_NODE:
                    self._root = child.nodeName
            self._PyObject = pyobj_from_dom(self._dom)

    def make_instance(self):
        """Return the Python object for the document's root element.

        DOM mode returns a fresh deep copy on every call.  EXPAT mode
        parses the file handle and returns the resulting root object.
        NOTE(review): in EXPAT mode the handle is consumed by the first
        call, so repeated calls presumably do not work -- confirm.
        """
        if self._parser == EXPAT:
            return self.ParseFile(self._fh)
        elif self._parser == DOM:
            # pyobj_from_dom() files every child under its py_name()
            # mangled tag, so the root has to be fetched by the mangled
            # name too (the raw name fails for tags such as 'my-root')
            return copy.deepcopy(getattr(self._PyObject, py_name(self._root)))
        else:
            return None

#-- expat based stream-oriented parser/objectifier
class ExpatFactory:
    """Build a tree of '_XO_<tag>' objects straight from expat callbacks.

    The constructor creates an expat parser and registers this object's
    '...Handler' methods on it.  ParseFile() and Parse() feed the parser
    and return the object created for the first element seen.

    NOTE(review): a non-None 'nspace_sep' presumably turns on expat's
    namespace processing, which would put the namespace URI into the
    element names handed to StartElementHandler -- confirm against the
    pyexpat documentation.
    """
    def __init__(self, encoding="UTF-8", nspace_sep=" "):
        self._myparser = xml.parsers.expat.ParserCreate(encoding, nspace_sep)
        self.returns_unicode = 1

        self._current = None    # object for the element being parsed
        self._root    = None    # object for the first element seen
        self._pcdata  = 0       # true while inside a CDATA section

        # Register every method of this class (and its direct bases)
        # whose name is also a parser attribute and contains 'Handler'
        # past the first character
        candidates = dir(self.__class__)
        for base in self.__class__.__bases__:
            candidates.extend(dir(base))
        parser_attrs = dir(self._myparser)
        for hook in candidates:
            if hook in parser_attrs and hook.find('Handler') > 0:
                setattr(self._myparser, hook, getattr(self, hook))

    def ParseFile(self, file):
        """Parse the open file 'file' and return the root object."""
        self._myparser.returns_unicode = self.returns_unicode
        self._myparser.ParseFile(file)
        return self._root

    def Parse(self, data, isfinal=1):
        """Parse the string 'data' and return the root object so far."""
        self._myparser.returns_unicode = self.returns_unicode
        self._myparser.Parse(data, isfinal)
        return self._root

    def StartElementHandler(self, name, attrs):
        """Create the object for an opening tag and make it current."""
        # Create mangled name for current Python class and define it if
        # need be.
        # SECURITY NOTE: the class name comes from the document and goes
        # through eval()/exec; safe_eval() does no filtering, so only
        # trusted XML should be parsed.
        pyname = py_name(name)
        klass = '_XO_' + pyname
        try:
            safe_eval(klass)
        except NameError:
            exec ('class %s(_XO_): pass' % klass)

        # Create an instance of the tag-named class
        py_obj = eval('%s()' % klass)

        # Does our current object have a child of this type already?
        if hasattr(self._current, pyname):
            # Convert a single child object into a list of children
            if type(getattr(self._current, pyname)) is not ListType:
                setattr(self._current, pyname, [getattr(self._current, pyname)])
            # Add the new subtag to the list of children
            getattr(self._current, pyname).append(py_obj)
        # The very first element becomes our 'product', self._root.  The
        # test is against None so that the outcome cannot depend on the
        # truth value of whatever class the root was built from.
        elif self._root is None:
            self._root = py_obj
        # Otherwise start out with the child object as attribute value
        else:
            setattr(self._current, pyname, py_obj)

        # Build the attributes of the object being created.  Names are
        # passed through py_name(), as pyobj_from_dom() does, so they are
        # reachable with ordinary attribute syntax.
        members = {}
        for key in attrs.keys():
            members[py_name(key)] = attrs[key]
        py_obj.__dict__ = members
        setattr(py_obj, '__parent__', self._current)

        self._current = py_obj

    def EndElementHandler(self, name):
        """Step back up to the parent of the element being closed."""
        self._current = self._current.__parent__

    def CharacterDataHandler(self, data):
        """Accumulate character data in the current object's PCDATA."""
        if self._pcdata:
            # Inside a CDATA section the text is kept verbatim
            glue = ''
        else:
            # Only use "real" node contents (not bare whitespace)
            data = data.strip()
            if not data:
                return
            glue = ' '
        if hasattr(self._current, 'PCDATA'):
            self._current.PCDATA = self._current.PCDATA + glue + data
        else:
            self._current.PCDATA = data

    def StartCdataSectionHandler(self):
        """Note that a CDATA section has begun."""
        self._pcdata = 1

    def EndCdataSectionHandler(self):
        """Note that the CDATA section has ended."""
        self._pcdata = 0


#-- Helper functions
def pyobj_from_dom(dom_node):
    """Recursively turn 'dom_node' into an instance of a tag-named class.

    The instance gets one attribute per XML attribute, one per kind of
    child node (a single object, or a list once a name repeats) and
    'PCDATA' for text content.  Depending on KEEP_CONTAINERS it may also
    get '_XML', the serialized XML of the node's children.
    """
    # Use a class of this name if safe_eval() can find one; otherwise
    # define a throwaway subclass of _XO_ right here
    klass = '_XO_' + py_name(dom_node.nodeName)
    try:
        safe_eval(klass)
    except NameError:
        exec ('class %s(_XO_): pass' % klass)
    py_obj = eval('%s()' % klass)

    # Tag attributes become instance attributes, under mangled names
    attributes = dom_node.attributes
    if attributes is None:
        attributes = {}
    for attr_name in attributes.keys():
        setattr(py_obj, py_name(attr_name), attributes[attr_name].value)

    # Bookkeeping used to decide whether the literal XML is worth keeping
    literal_xml = ''
    text_before_tag = 0     # real text seen ahead of any subtag
    text_after_tag = 0      # first real text seen only after a subtag
    seen_subtag = 0

    # Walk the tag contents (subtags and PCDATA)
    for child in dom_node.childNodes:
        child_attr = py_name(child.nodeName)
        if KEEP_CONTAINERS > NEVER:
            literal_xml = literal_xml + child.toxml()

        # Text is a kind of node, but never a new subtag
        if child.nodeName == '#text':
            if hasattr(py_obj, 'PCDATA'):
                py_obj.PCDATA = py_obj.PCDATA + child.nodeValue
            elif child.nodeValue.strip():
                # the first chunk is kept only if it is not bare whitespace
                py_obj.PCDATA = child.nodeValue
                if seen_subtag:
                    text_after_tag = 1
                else:
                    text_before_tag = 1
            continue

        # First child of this name: store the object itself
        if not hasattr(py_obj, child_attr):
            setattr(py_obj, child_attr, pyobj_from_dom(child))
            seen_subtag = 1
            continue

        # Name already taken: make sure a list is stored, then extend it
        siblings = getattr(py_obj, child_attr)
        if type(siblings) is not ListType:
            siblings = [siblings]
            setattr(py_obj, child_attr, siblings)
        siblings.append(pyobj_from_dom(child))

    # Decide whether to save the literal character string of the element
    if KEEP_CONTAINERS > NEVER:
        if KEEP_CONTAINERS >= ALWAYS:
            keep_xml = 1
        else:
            # keep it only when text and subtags appear to be mixed
            keep_xml = seen_subtag and (text_before_tag or text_after_tag)
        if keep_xml:
            py_obj._XML = literal_xml

    return py_obj

def py_name(name):
    """Return 'name' with characters unusable in Python names replaced.

    Each '#', ':', '-' and '.' becomes an underscore.  '#' occurs in DOM
    node names such as '#text'; the other three are legal in XML names
    but not in Python identifiers, and the result is used both as an
    attribute name and as part of a generated class name.
    """
    for char in '#:-.':
        name = name.replace(char, '_')
    return name

def safe_eval(s):
    """Evaluate the expression string 's' and return its value.

    Within this module it is used to test whether a class of a given
    name already exists; a NameError from the evaluation propagates to
    the caller.

    SECURITY NOTE: despite the name, no check of 's' is performed -- the
    string is handed to eval() as is.  The former "malicious string"
    branch could never run (its condition was the constant 0), so it
    has been removed rather than kept as a false promise.  Do not pass
    untrusted text.
    """
    return eval(s)


#-- Self-test utility functions
def pyobj_printer(py_obj, level=0):
    """Return a "deep" string description of a Python object.

    Arguments:
        py_obj  -- the object to describe
        level   -- indentation, in spaces, for this object's lines; at
                   level 0 a header naming the object's class comes first

    An object carrying '_XML' is shown as one 'CONTENT=' line holding
    that XML with whitespace collapsed, cut to 50 characters.  Any other
    object is shown member by member: instances and lists of children
    are described recursively three spaces deeper, everything else as
    'name=value' with the value cut to 50 characters.  A cut value is
    marked with a trailing '...'.
    """
    indent = ' ' * level
    if level == 0:
        descript = '-----* ' + py_obj.__class__.__name__ + ' *-----\n'
    else:
        descript = ''

    if hasattr(py_obj, '_XML'):     # present the literal XML of object
        # Appended to (not substituted for) the header built above, and
        # marked with '...' only when something was really cut off
        content = ' '.join(py_obj._XML.split())
        if len(content) > 50:
            content = content[:50] + '...'
        return descript + indent + 'CONTENT=' + content + '\n'

    # present the object hierarchy view
    for membname in dir(py_obj):
        if membname == "__parent__":
            continue                # ExpatFactory's bookkeeping attribute
        member = getattr(py_obj, membname)
        if type(member) == InstanceType:
            descript = descript + '\n' + indent + '{' + membname + '}\n'
            descript = descript + pyobj_printer(member, level + 3)
        elif type(member) == ListType:
            for i in range(len(member)):
                descript = descript + '\n' + indent + \
                           '[' + membname + '] #' + str(i + 1)
                descript = descript + indent + '\n' + \
                           pyobj_printer(member[i], level + 3)
        else:
            memval = ' '.join(str(member).split())
            if len(memval) > 50:
                memval = memval[:50] + '...'
            descript = descript + indent + membname + '=' + memval + '\n'
    return descript


#-- Module self-test
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        for filename in sys.argv[1:]:
            xml_obj = XML_Objectify(filename)
            py_obj = xml_obj.make_instance()
            print pyobj_printer(py_obj)
    else:
        print "Please specify one or more XML files to Objectify."