#!/usr/bin/env python

"""Create full-text XPATH indexes of XML documents

Notes:

  See http://gnosis.cx/publish/programming/xml_matters_10.txt
  for a detailed discussion of this module.

  To make this module work, you will want the following:

    [indexer] module:
      http://gnosis.cx/download/indexer.py

    [xml_objectify] module:
      http://gnosis.cx/download/xml_objectify.py

Classes:

  XML_Indexer(PreferredIndexer, TextSplitter)

"""
__shell_usage__ = """
Shell Usage: [python] xml_indexer.py [options] [xml_file_to_index]

    -h, /h, -?, /?, ?, --help:    Show this help screen
"""
__version__ = "$Revision: 0.10 $"
__author__=["David Mertz (mertz@gnosis.cx)",]
__thanks_to__=[]
__copyright__="""
    This file is released to the public domain.  I (dqm) would
    appreciate it if you choose to keep derived works under terms
    that promote freedom, but obviously am giving up any rights
    to compel such.
"""
__history__="""
    0.05   Pre-initial "proof-of-concept"

    0.1    Initial version.
"""

from types import *

# Names from xml_objectify (and configure xml_objectify)
from xml_objectify import XML_Objectify, keep_containers
from xml_objectify import ALWAYS, MAYBE, NEVER, DOM, EXPAT
keep_containers(MAYBE)

# Which concrete indexer should serve as the base for XML_Indexer?
import indexer
PreferredIndexer = indexer.SlicedZPickleIndexer

# Create a class to implement XPATH-like indexing
class XML_Indexer(PreferredIndexer, indexer.TextSplitter):
    """Concrete Indexer for XML-as-hierarchical-filesystem
    """
    def add_file(self, fname):
        "Index the nodes of an XML file"
        # Read in the file (if possible)
        try:
            py_obj = XML_Objectify(fname, EXPAT).make_instance()
            if not py_obj:      # Fallback to DOM where Expat has problems
                raise "BadPaserError"
                py_obj = XML_Objectify(fname, DOM).make_instance()
            if self.quiet < 5: print "Indexing", fname
        except IOError:
            return 0
        self.fname_prefix = fname
        self.recurse_nodes(py_obj)

    def recurse_nodes(self, currnode, xpath_suffix=""):
        "Recurse and process nodes in XML file"
        if hasattr(currnode, '_XML'):   # maybe present literal XML of object
            text = currnode._XML.encode('UTF-8')
            self.add_nodetext(text, xpath_suffix)
        else:
            for membname in dir(currnode):
                if membname == "__parent__":
                   continue             # ExpatFactory uses bookeeping attribute
                member = getattr(currnode, membname)
                if type(member) is InstanceType:
                    xpath = xpath_suffix+'/'+membname
                    self.recurse_nodes(member, xpath.encode('UTF-8'))
                elif type(member) is ListType:
                    for i in range(len(member)):
                        xpath = xpath_suffix+'/'+membname+'['+str(i+1)+']'
                        self.recurse_nodes(member[i], xpath.encode('UTF-8'))
                elif type(member) is StringType:
                    if membname != 'PCDATA':
                        xpath = xpath_suffix+'/@'+membname
                        self.add_nodetext(member, xpath.encode('UTF-8'))
                    else:
                        self.add_nodetext(member, xpath_suffix.encode('UTF-8'))
                elif type(member) is UnicodeType:
                    if membname != 'PCDATA':
                        xpath = xpath_suffix+'/@'+membname
                        self.add_nodetext(member.encode('UTF-8'),
                                          xpath.encode('UTF-8'))
                    else:
                        self.add_nodetext(member.encode('UTF-8'),
                                          xpath_suffix.encode('UTF-8'))
                else:
                    raise TypeError, \
                          "Unsupported Node Type: "+`type(member)`+`member`

    def add_nodetext(self, text, xpath_suffix):
        "Add the node PCDATA to index, using full XPATH to node as key"
        node_id = self.fname_prefix+'::'+xpath_suffix
        words = self.splitter(text, 'text/plain')

        # Find new node index, and assign it to node_id
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
        node_index =  abs(self.files['_TOP'][0])
        self.files[node_id] = (node_index, len(words))

        for word in words:
            if self.words.has_key(word):
                entry = self.words[word]
            else:
                entry = {}
            if entry.has_key(node_index):
                entry[node_index] = entry[node_index]+1
            else:
                entry[node_index] = 1
            self.words[word] = entry

#-- If called from command-line, parse arguments and take actions
if __name__ == '__main__':
    import os,sys
    ndx = XML_Indexer()
    ndx.load_index()
    if len(sys.argv)==1:
        if sys.argv[1]opt in ('-h','/h','-?','/?','?','--help'):   # help screen
            print __shell_usage__
        else:
            sys.stderr.write("Perhaps you would like to use the --help option?\n")
    else:
        ndx.add_files(dir=os.getcwd(),pattern=sys.argv[1])
        ndx.save_index()