added flatten & flattener

2008-11-04 17:20:10 +00:00 · 2008-11-04 17:20:10 +00:00 · 8ead922b4c
commit 8ead922b4c
parent 31ffeaff61
3 changed files with 58 additions and 3 deletions
--- a/examples/fec_cobol.py
+++ b/examples/fec_cobol.py
@ -45,21 +45,21 @@ def process_fec_year(year):
    source = FixedWidthFileSource(open('%s/foiacm.dta' % year), CM_FIELDS)
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
    emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
-    run_recipe(source, [emit_mysql])
+    run_recipe(source, emit_mysql)

    # candidate
    source = FixedWidthFileSource(open('%s/foiacn.dta' % year), CN_FIELDS)
    fieldremover = FieldRemover(('fillerA', 'fillerB'))
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'candidate', [f[0] for f in CN_FIELDS if f[0] != 'filler'])
    emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'candidate', [f[0] for f in CN_FIELDS if not f[0].startswith('filler')])
-    run_recipe(source, [fieldremover, emit_mysql])
+    run_recipe(source, fieldremover, emit_mysql)

    # contributions
    source = FixedWidthFileSource(open('%s/itcont.dta' % year), INDIV_FIELDS)
    decobolizer = FieldModifier(('amount', ), fix_cobol_number)
    #sqlite = SqliteOutput('fec%s.sqlite' % year, 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
    emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
-    run_recipe(source, [decobolizer, emit_mysql])
+    run_recipe(source, decobolizer, emit_mysql)

 if __name__=='__main__':
    process_fec_year(2008)
--- a/saucebrush/filters.py
+++ b/saucebrush/filters.py
@ -8,6 +8,7 @@
 """

 from exceptions import NotImplementedError
+from saucebrush import utils

 ######################
 ## Abstract Filters ##
@ -156,6 +157,9 @@ class FieldAdder(Filter):

        from itertools import count
        FieldAdder('id', count)
+        
+        would yield a new column named id that uses the itertools count iterable
+        to create sequentially numbered ids.
    """

    def __init__(self, field_name, field_value):
@ -214,6 +218,35 @@ class Splitter(Filter):
        return record


+class Flattener(Filter):
+    """ Flatten the nested dictionaries of each record into a single level.
+
+        Every record is handed to saucebrush.utils.flatten, which joins the
+        keys of nested dictionaries with '_'.  The filter takes no arguments:
+
+        flattener = Flattener()
+
+        so a record such as
+
+        {'name': 'x', 'address': {'state': 'NC', 'street': '146 shirley drive'}}
+
+        is emitted as
+
+        {'name': 'x', 'address_state': 'NC', 'address_street': '146 shirley drive'}
+    """
+
+    def __init__(self):
+        super(Flattener, self).__init__()
+
+    def process_record(self, record):
+        # all of the work lives in the shared helper so it can be reused
+        # outside of a recipe; for a dict record it builds and returns a
+        # new dictionary
+        return utils.flatten(record)
+    

 ###########################
 ## Commonly Used Filters ##
--- a/saucebrush/utils.py
+++ b/saucebrush/utils.py
@ -32,6 +32,28 @@ def string_dig(element, joiner=''):
        return joiner.join([string_dig(child) for child in element.findAll(True)])


+def flatten(item, prefix=''):
+    """
+        Flatten nested dictionary into one with its keys concatenated together.
+
+        Keys of nested dictionaries are joined with '_'.  Lists and tuples are
+        returned as lists under their own key: dictionaries inside them are
+        flattened on their own (without the outer prefix), any other element
+        is kept unchanged.  A non-dict ``item`` is returned wrapped as
+        ``{prefix: item}``.
+
+        >>> nested = {'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}, 3],
+        ...           'f':{'g':{'h':6}}}
+        >>> flatten(nested) == {'a': 1, 'b_c': 2,
+        ...                     'd': [{'e_r': 7}, {'e': 5}, 3], 'f_g_h': 6}
+        True
+    """
+    if isinstance(item, dict):
+        # don't prepend a leading _
+        if prefix != '':
+            prefix += '_'
+        retval = {}
+        # items() rather than iteritems() so this runs on Python 2 and 3
+        for key, value in item.items():
+            retval.update(flatten(value, prefix + key))
+        return retval
+    elif isinstance(item, (tuple, list)):
+        # A non-dict element comes back from flatten() as {'': element};
+        # unwrap it so scalars and nested lists are not turned into dicts.
+        return {prefix: [flatten(i) if isinstance(i, dict) else flatten(i)['']
+                         for i in item]}
+    else:
+        return {prefix: item}
+
+
 def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
    """
        Do a lookup within dict_ by the various elements of dotted_key.