added flatten & flattener

This commit is contained in:
James Turk 2008-11-04 17:20:10 +00:00
parent 31ffeaff61
commit 8ead922b4c
3 changed files with 58 additions and 3 deletions

View File

@ -45,21 +45,21 @@ def process_fec_year(year):
source = FixedWidthFileSource(open('%s/foiacm.dta' % year), CM_FIELDS)
#sqlite = SqliteOutput('fec%s.sqlite' % year, 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'committee', [f[0] for f in CM_FIELDS if f[0] != 'filler'])
run_recipe(source, [emit_mysql])
run_recipe(source, emit_mysql)
# candidate
source = FixedWidthFileSource(open('%s/foiacn.dta' % year), CN_FIELDS)
fieldremover = FieldRemover(('fillerA', 'fillerB'))
#sqlite = SqliteOutput('fec%s.sqlite' % year, 'candidate', [f[0] for f in CN_FIELDS if f[0] != 'filler'])
emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'candidate', [f[0] for f in CN_FIELDS if not f[0].startswith('filler')])
run_recipe(source, [fieldremover, emit_mysql])
run_recipe(source, fieldremover, emit_mysql)
# contributions
source = FixedWidthFileSource(open('%s/itcont.dta' % year), INDIV_FIELDS)
decobolizer = FieldModifier(('amount', ), fix_cobol_number)
#sqlite = SqliteOutput('fec%s.sqlite' % year, 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
emit_mysql = SqlDumpEmitter(open('fec%s.sql' % year,'a'), 'contribution', [f[0] for f in INDIV_FIELDS if f[0] != 'filler'])
run_recipe(source, [decobolizer, emit_mysql])
run_recipe(source, decobolizer, emit_mysql)
if __name__=='__main__':
process_fec_year(2008)

View File

@ -8,6 +8,7 @@
"""
from exceptions import NotImplementedError
from saucebrush import utils
######################
## Abstract Filters ##
@ -156,6 +157,9 @@ class FieldAdder(Filter):
from itertools import count
FieldAdder('id', count)
would yield a new column named id that uses the itertools count iterable
to create sequentially numbered ids.
"""
def __init__(self, field_name, field_value):
@ -214,6 +218,35 @@ class Splitter(Filter):
return record
class Flattener(Filter):
    """ Collapse nested dictionaries in a record into a single flat dictionary.

        Every record is passed through utils.flatten, which joins the keys of
        nested dictionaries into one concatenated key.  For example, the
        record:

        {'address': {'state': 'NC', 'street': '146 shirley drive'}}

        comes out of Flattener() as:

        {'address_state': 'NC', 'address_street': '146 shirley drive'}

        The filter takes no constructor arguments; the whole record is always
        flattened.
    """

    def __init__(self):
        super(Flattener, self).__init__()

    def process_record(self, record):
        """ Return the flattened form of record (see utils.flatten). """
        return utils.flatten(record)
###########################
## Commonly Used Filters ##

View File

@ -32,6 +32,28 @@ def string_dig(element, joiner=''):
return joiner.join([string_dig(child) for child in element.findAll(True)])
def _flatten_element(element, separator):
    """
    Flatten a single member of a list or tuple.

    Dictionaries are flattened on their own (with no prefix), nested
    sequences are processed member by member, and anything else is returned
    untouched so that plain values are not wrapped in a dictionary.
    """
    if isinstance(element, dict):
        return flatten(element, separator=separator)
    if isinstance(element, (tuple, list)):
        return [_flatten_element(child, separator) for child in element]
    return element


def flatten(item, prefix='', separator='_'):
    """
    Flatten nested dictionary into one with its keys concatenated together.

    item is the value to flatten, prefix is the key accumulated so far
    (callers normally leave it empty) and separator is the string placed
    between concatenated key names.  The result is always a dictionary; a
    non-dictionary item is returned as {prefix: item}.  Lists and tuples
    become lists whose dictionary members are flattened individually and
    whose other members are kept as they are.

    >>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
    ...          'f':{'g':{'h':6}}}) == {'a': 1, 'b_c': 2,
    ...          'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
    True
    >>> flatten({'tags': ['x', 'y']}) == {'tags': ['x', 'y']}
    True
    >>> flatten({'a': {'b': 1}}, separator='.') == {'a.b': 1}
    True
    """
    if isinstance(item, dict):
        # don't prepend a leading separator
        if prefix != '':
            prefix += separator
        retval = {}
        # items() instead of iteritems() so this runs on Python 2 and 3
        for key, value in item.items():
            retval.update(flatten(value, prefix + key, separator))
        return retval
    elif isinstance(item, (tuple, list)):
        # members are flattened independently, so they get no prefix
        return {prefix: [_flatten_element(i, separator) for i in item]}
    else:
        return {prefix: item}
def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
"""
Do a lookup within dict_ by the various elements of dotted_key.