#---------- fields_stats.py----------# # Perform calculations on one or more of the # fields in a structured text database. import operator from types import * from xreadlines import xreadlines # req 2.1, but is much faster... # could use .readline() meth < 2.1 #-- Symbolic Constants DELIMITED = 1 FLATFILE = 2 #-- Some sample "statistical" func (in functional programming style) nillFunc = lambda lst: None toFloat = lambda lst: map(float, lst) avg_lst = lambda lst: reduce(operator.add, toFloat(lst))/len(lst) sum_lst = lambda lst: reduce(operator.add, toFloat(lst)) max_lst = lambda lst: reduce(max, toFloat(lst)) class FieldStats: """Gather statistics about structured text database fields text_db may be either string (incl. Unicode) or file-like object style may be in (DELIMITED, FLATFILE) delimiter specifies the field separator in DELIMITED style text_db column_positions lists all field positions for FLATFILE style, using one-based indexing (first column is 1). E.g.: (1, 7, 40) would take fields one, two, three from columns 1, 7, 40 respectively. field_funcs is a dictionary with column positions as keys, and functions on lists as values. E.g.: {1:avg_lst, 4:sum_lst, 5:max_lst} would specify the average of column one, the sum of column 4, and the max of column 5. All other cols--incl 2,3, >=6-- are ignored. """ def __init__(self, text_db='', style=DELIMITED, delimiter=',', column_positions=(1,), field_funcs={} ): self.text_db = text_db self.style = style self.delimiter = delimiter self.column_positions = column_positions self.field_funcs = field_funcs def calc(self): """Calculate the column statistics """ #-- 1st, create a list of lists for data (incl. unused flds) used_cols = self.field_funcs.keys() used_cols.sort() # one-based column naming: column[0] is always unused columns = [] for n in range(1+used_cols[-1]): # hint: '[[]]*num' creates refs to same list columns.append([]) #-- 2nd, fill lists used for calculated fields # might use a string directly for text_db if type(self.text_db) in (StringType,UnicodeType): for line in self.text_db.split('\n'): fields = self.splitter(line) for col in used_cols: field = fields[col-1] # zero-based index columns[col].append(field) else: # Something file-like for text_db for line in xreadlines(self.text_db): fields = self.splitter(line) for col in used_cols: field = fields[col-1] # zero-based index columns[col].append(field) #-- 3rd, apply the field funcs to column lists results = [None] * (1+used_cols[-1]) for col in used_cols: results[col] = \ apply(self.field_funcs[col],(columns[col],)) #-- Finally, return the result list return results def splitter(self, line): """Split a line into fields according to curr inst specs""" if self.style == DELIMITED: return line.split(self.delimiter) elif self.style == FLATFILE: fields = [] # Adjust offsets to Python zero-based indexing, # and also add final position after the line num_positions = len(self.column_positions) offsets = [(pos-1) for pos in self.column_positions] offsets.append(len(line)) for pos in range(num_positions): start = offsets[pos] end = offsets[pos+1] fields.append(line[start:end]) return fields else: raise ValueError, \ "Text database must be DELIMITED or FLATFILE" #-- Test data # First Name, Last Name, Salary, Years Seniority, Department delim = ''' Kevin,Smith,50000,5,Media Relations Tom,Woo,30000,7,Accounting Sally,Jones,62000,10,Management '''.strip() # no leading/trailing newlines # Comment First Last Salary Years Dept flat = ''' tech note Kevin Smith 50000 5 Media Relations more filler Tom Woo 30000 7 Accounting yet more... Sally Jones 62000 10 Management '''.strip() # no leading/trailing newlines #-- Run self-test code if __name__ == '__main__': getdelim = FieldStats(delim, field_funcs={3:avg_lst,4:max_lst}) print 'Delimited Calculations:' results = getdelim.calc() print ' Average salary -', results[3] print ' Max years worked -', results[4] getflat = FieldStats(flat, field_funcs={3:avg_lst,4:max_lst}, style=FLATFILE, column_positions=(15,25,35,45,52)) print 'Flat Calculations:' results = getflat.calc() print ' Average salary -', results[3] print ' Max years worked -', results[4]