From 8ead922b4c6cba3b5bb68a8d0fbf38fe35d4cff9 Mon Sep 17 00:00:00 2001 From: James Turk Date: Tue, 4 Nov 2008 17:20:10 +0000 Subject: [PATCH] added flatten & flattener --- examples/fec_cobol.py | 6 +++--- saucebrush/filters.py | 33 +++++++++++++++++++++++++++++++++ saucebrush/utils.py | 22 ++++++++++++++++++++++ 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/examples/fec_cobol.py b/examples/fec_cobol.py index 8330909..1a2221a 100644 --- a/examples/fec_cobol.py +++ b/examples/fec_cobol.py @@ -45,21 +45,21 @@ def process_fec_year(year): source = FixedWidthFileSource(open('%s/foiacm.dta' % year), CM_FIELDS) #sqlite = SqliteOutput('fec%s.sqlite' % year, 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler']) emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler']) - run_recipe(source, [emit_mysql]) + run_recipe(source, emit_mysql) # candidate source = FixedWidthFileSource(open('%s/foiacn.dta' % year), CN_FIELDS) fieldremover = FieldRemover(('fillerA', 'fillerB')) #sqlite = SqliteOutput('fec%s.sqlite' % year, 'candidate', [f[0] for f in CN_FIELDS if f[0] != 'filler']) emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'candidate', [f[0] for f in CN_FIELDS if not f[0].startswith('filler')]) - run_recipe(source, [fieldremover, emit_mysql]) + run_recipe(source, fieldremover, emit_mysql) # contributions source = FixedWidthFileSource(open('%s/itcont.dta' % year), INDIV_FIELDS) decobolizer = FieldModifier(('amount', ), fix_cobol_number) #sqlite = SqliteOutput('fec%s.sqlite' % year, 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler']) emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler']) - run_recipe(source, [decobolizer, emit_mysql]) + run_recipe(source, decobolizer, emit_mysql) if __name__=='__main__': process_fec_year(2008) diff --git a/saucebrush/filters.py b/saucebrush/filters.py index f56e14a..80fd61c 
from saucebrush import utils


class Flattener(Filter):
    """ Flatten the nested dictionaries of every record.

        Each record is handed to utils.flatten, which collapses nested
        dictionaries into a single level by joining their keys with '_'.
        Dictionaries that sit inside a list are flattened in place and the
        list itself is kept.

        The filter takes no arguments:

            flattener = Flattener()

        would turn the record

            {'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
                           {'address': {'state':'NY', 'street':'3000 Winton Rd'}}]}

        into

            {'addresses': [{'address_state':'NC', 'address_street':'146 shirley drive'},
                           {'address_state':'NY', 'address_street':'3000 Winton Rd'}]}
    """

    def __init__(self):
        super(Flattener, self).__init__()

    def process_record(self, record):
        """ Return the result of utils.flatten(record). """
        return utils.flatten(record)
def _flatten_list_item(element, separator):
    """ Flatten one list/tuple element without wrapping it in a dict. """
    if isinstance(element, dict):
        # each dict in a list starts a fresh key namespace (no prefix)
        return flatten(element, '', separator)
    if isinstance(element, (tuple, list)):
        return [_flatten_list_item(child, separator) for child in element]
    # scalars stay as they are; they must not become {'': value}
    return element


def flatten(item, prefix='', separator='_'):
    """
    Flatten nested dictionary into one with its keys concatenated together.

    >>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
    ...          'f':{'g':{'h':6}}})
    {'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
    >>> flatten({'tags': ['x', {'k': {'v': 1}}]})
    {'tags': ['x', {'k_v': 1}]}

    item      -- the value to flatten, normally a dictionary
    prefix    -- key prefix accumulated so far; callers normally leave it
                 empty
    separator -- string placed between the joined key parts (default '_')

    Returns a new dictionary; item is not modified.

    * Nested dictionaries are merged into the result under
      parent + separator + child keys.
    * Lists and tuples are returned as lists.  Dictionaries inside them are
      flattened individually (without the outer prefix), nested sequences
      are processed the same way, and any other element is kept unchanged.
    * A non-dictionary item is returned as {prefix: item}.
    * An empty nested dictionary contributes no keys to the result.
    * If two paths produce the same flattened key, the one visited last
      wins.

    Keys are joined with +, so a key that is not a string raises TypeError.
    """
    if isinstance(item, dict):
        # don't prepend a leading separator
        if prefix != '':
            prefix += separator
        retval = {}
        # .items() instead of .iteritems() so this runs on Python 2 and 3
        for key, value in item.items():
            retval.update(flatten(value, prefix + key, separator))
        return retval
    if isinstance(item, (tuple, list)):
        return {prefix: [_flatten_list_item(i, separator) for i in item]}
    return {prefix: item}