#!/usr/bin/python """Convert an XML DTD to a set of SQL 'CREATE TABLE' statements Note: See http://gnosis.cx/publish/programming/xml_matters_12.txt for a detailed discussion of this module. Functions: dict2Sql(dtd_dict) parseDTD(raw_dtd): """ __shell_usage__ = """ Shell Usage: [python] dtd2sql.py [DTD] [< DTD_from_STDIN] """ __version__ = "$Revision: 0.10 $" __author__=["David Mertz (mertz@gnosis.cx)",] __thanks_to__=[] __copyright__=""" This file is released to the public domain. I (dqm) would appreciate it if you choose to keep derived works under terms that promote freedom, but obviously am giving up any rights to compel such. """ __history__=""" 0.10 Initial version. """ # Constants MYSQL = 1 DB2 = 2 ORACLE = 3 MSSQL = 4 POSTGRES = 5 import string, sys, re def dict2Sql(dtd_dict): create_table_lines = [] for key, col_list in dtd_dict.items(): stmt = "CREATE TABLE "+key stmt += " (primary_key BIGINT UNSIGNED PRIMARY KEY,seq INT UNSIGNED" for col_name in col_list: if col_name[:12] == 'foreign_key_': # foreign keys are type BIGINT stmt += ","+col_name+" BIGINT UNSIGNED" else: stmt += ","+col_name+" BLOB" # other columns are BLOBs stmt += ")" create_table_lines.append(stmt) return create_table_lines def parseDTD(raw_dtd): dtd_dict = {} for elem_def in re.findall(r"(?s)(?:)", raw_dtd): # split subelement disjunctions, quantifiers, and the like dtd_punct = ('|','(',')',',','+','*','?') norm_def = elem_def for c in dtd_punct: norm_def = norm_def.replace(c, " %s " % c) elem_split = string.split(norm_def) elem_name = elem_split[0].replace('-','__') dtd_dict[elem_name] = dtd_dict.get(elem_name,[]) if elem_split[1] == "EMPTY": # No subelements or PCDATA pass elif elem_split[2] == "#PCDATA": # Has some PCDATA dtd_dict[elem_name].append('PCDATA') if elem_split[3] == "|": # Also has subelements dtd_dict[elem_name].append('_XML') for subtag in elem_split[4:]: if subtag not in dtd_punct: safetag = subtag.replace('-','__') if dtd_dict.has_key(safetag): dtd_dict[safetag].append('foreign_key_'+elem_name) else: dtd_dict[safetag] = ['foreign_key_'+elem_name] else: # Has only subelements for subtag in elem_split[2:]: if subtag not in dtd_punct: safetag = subtag.replace('-','__') if dtd_dict.has_key(safetag): dtd_dict[safetag].append('foreign_key_'+elem_name) else: dtd_dict[safetag] = ['foreign_key_'+elem_name] for att_def in re.findall(r"(?s)(?:)", raw_dtd): # let's make sure to isolate enumerated attribute parens norm_def = att_def for c in dtd_punct: norm_def = norm_def.replace(c, " %s " % c) att_split = string.split(norm_def) elem_name = att_split[0].replace('-','__') get_col_name = 1 eat_quotes = 0 for symbol in att_split[1:]: if get_col_name: dtd_dict[elem_name].append(symbol) get_col_name = 0 # Got a column name, eat next stuff elif eat_quotes: eat_quotes -= 1 # Decrement quote-eater if not eat_quotes: # If quotes exhausted, ready for column get_col_name = 1 elif symbol in ("#IMPLIED","#REQUIRED"): get_col_name = 1 # Next symbol will be column name elif symbol == "#FIXED": eat_quotes = 2 # Must exhaust quoted fixed attribute return dtd_dict #-- Command line version of tool if __name__ == '__main__': if len(sys.argv) > 1: if sys.argv[1] in ('-h','/h','-?','/?','?','--help'): print __shell_usage__ else: raw_dtd = open(sys.argv[1]).read() for createLine in dict2Sql(parseDTD(raw_dtd)): print createLine+';' else: raw_dtd = sys.stdin.read() for createLine in dict2Sql(parseDTD(raw_dtd)): print createLine+';'