DateCleaner, FieldRenamer, etc.

This commit is contained in:
James Turk 2008-11-13 21:31:42 +00:00
parent 07842557fb
commit 163d92d44b
4 changed files with 110 additions and 22 deletions

View File

@ -4,29 +4,47 @@
""" """
import saucebrush import saucebrush
from saucebrush import utils
from saucebrush.filters import * from saucebrush.filters import *
from saucebrush.emitters import DebugEmitter from saucebrush.emitters import DjangoModelEmitter, DebugEmitter
import lobbyists import lobbyists
def process_sopr_filing(sopr_xml_file): def process_sopr_filing(sopr_xml_file):
from sunlightapi import settings as DJ_SETTINGS
DJ_APPLABEL = 'lobbyists'
saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file), saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file),
FieldRemover(['govt_entities', 'affiliated_orgs']), FieldRemover(['govt_entities', 'affiliated_orgs']),
Flattener(['issues', 'lobbyists']), Flattener(['issues', 'lobbyists']),
saucebrush.filters.Splitter({ FieldCopier({'issues.filing_id': 'filing.id',
'client':[FieldRemover(['state_or_local_gov', 'status']), NameCleaner(['contact_name'])], 'client.filing_id': 'filing.id',
'filing':[FieldRemover(['affiliated_orgs_url'])], 'lobbyists.filing_id': 'filing.id',
'issues':[], 'registrant.filing_id': 'filing.id'}),
'lobbyists':[FieldRemover(['indicator', 'status']), NameCleaner(['name']), Unique()], saucebrush.filters.Splitter({
'registrant':[NameCleaner(['name'])], 'client':[FieldRemover(['state_or_local_gov', 'status']),
}), NameCleaner(['contact_name']),
FieldCopier({'issues.filing_id': 'filing.id', FieldRenamer({'raw_contact_name': 'contact_name'}),
'client.filing_id': 'filing.id', DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'client')
'lobbyists.filing_id': 'filing.id', ],
'registrant.filing_id': 'filing.id'}), 'filing':[FieldRemover(['affiliated_orgs_url']),
DebugEmitter(open('test.out','w')), DateCleaner(['filing_date'], from_format='%Y-%m-%dT00:00:00', to_format='%Y-%m-%d'),
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'filing')
],
'issues':[DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'issue')],
'lobbyists':[FieldRemover(['indicator', 'status']),
NameCleaner(['name']),
FieldRenamer({'raw_name': 'name'}),
Unique(),
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'lobbyist')
],
'registrant':[NameCleaner(['name']),
FieldRenamer({'raw_name': 'name'}),
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'registrant')
],
}),
) )
if __name__ == '__main__': if __name__ == '__main__':
process_sopr_filing('sample.xml') import sys
for fname in sys.argv[1:]:
print 'processing', fname
process_sopr_filing(fname)

View File

@ -156,7 +156,9 @@ class DjangoModelEmitter(Emitter):
def __init__(self, dj_settings, app_label, model_name): def __init__(self, dj_settings, app_label, model_name):
super(DjangoModelEmitter, self).__init__() super(DjangoModelEmitter, self).__init__()
from saucebrush.utils import get_django_model from saucebrush.utils import get_django_model
self.dbmodel = get_django_model(dj_settings, app_label, model_name) self._dbmodel = get_django_model(dj_settings, app_label, model_name)
if not self._dbmodel:
raise Exception("No such model: %s %s" % (app_label, model_name))
def emit_record(self, record): def emit_record(self, record):
self.dbmodel.objects.create(**record) self._dbmodel.objects.create(**record)

View File

@ -9,6 +9,7 @@
from saucebrush import utils from saucebrush import utils
import re import re
import time
###################### ######################
## Abstract Filters ## ## Abstract Filters ##
@ -226,6 +227,26 @@ class FieldCopier(Filter):
utils.dotted_key_set(record, dest, srcval) utils.dotted_key_set(record, dest, srcval)
return record return record
class FieldRenamer(Filter):
    """ Filter that renames one field to another.

        Takes a dictionary mapping destination keys to source keys, e.g.
        FieldRenamer({'raw_name': 'name'}) moves the value found under
        'name' to 'raw_name'.  Keys are handed to utils.dotted_key_pop and
        utils.dotted_key_set, so dotted paths may be used.
    """

    def __init__(self, rename_mapping):
        super(FieldRenamer, self).__init__()
        self._rename_mapping = rename_mapping

    def process_record(self, record):
        """ Move every mapped source value to its destination key.

            Sources that cannot be found are skipped silently.  The record
            is modified in place and returned.
        """
        # mapping is dest:source
        # items() (not iteritems()) so this runs on both Python 2 and 3
        for dest, source in self._rename_mapping.items():
            try:
                srcval = utils.dotted_key_pop(record, source)
            except (KeyError, IndexError, AttributeError):
                # dotted_key_pop reports a missing source with any of these
                # (dict key, list index or attribute): nothing to rename
                continue
            # kept outside the try: the source has already been removed, so
            # a failure while writing the destination must not be hidden
            utils.dotted_key_set(record, dest, srcval)
        return record
class Splitter(Filter): class Splitter(Filter):
""" Filter that splits nested data into different paths. """ Filter that splits nested data into different paths.
@ -333,6 +354,19 @@ class PhoneNumberCleaner(FieldFilter):
item = self._number_format % tuple(nums) item = self._number_format % tuple(nums)
return item return item
class DateCleaner(FieldFilter):
    """ Filter that rewrites date strings from one format into another.

        Takes a list of target keys, the from_format the incoming values
        are parsed with and the to_format they are written back in; both
        are strptime/strftime style format strings.
    """

    def __init__(self, keys, from_format, to_format):
        super(DateCleaner, self).__init__(keys)
        self._to_format = to_format
        self._from_format = from_format

    def process_field(self, item):
        # parse using the source format, then render using the target one
        parsed = time.strptime(item, self._from_format)
        return time.strftime(self._to_format, parsed)
class NameCleaner(Filter): class NameCleaner(Filter):
""" Filter that splits names into a first, last, and middle name field. """ Filter that splits names into a first, last, and middle name field.

View File

@ -61,8 +61,8 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
""" """
Do a lookup within dict_ by the various elements of dotted_key. Do a lookup within dict_ by the various elements of dotted_key.
Optionally specifiy a default to return if key does not exist (similar Optionally specify a default to return if key does not exist (similar
to default to default)
>>> d = {'a': {'b': {'c': 3} } } >>> d = {'a': {'b': {'c': 3} } }
>>> dotted_key_lookup(d, 'a.b.c') >>> dotted_key_lookup(d, 'a.b.c')
@ -87,6 +87,37 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
val = default val = default
return val return val
def dotted_key_pop(dict_, dotted_key, default=KeyError, separator='.'):
    """
        Remove and return a value within dict_ by the elements of dotted_key.

        Every element but the last is used to walk into dict_ (as a dict
        key, an integer list/tuple index, or an attribute name); the last
        element is then deleted from the container that was reached and the
        value it held is returned.

        If the path cannot be followed, default is returned when one was
        supplied; when default is left as KeyError the underlying exception
        (KeyError, IndexError, AttributeError or ValueError) is re-raised.

        >>> d = {'a': {'b': {'c': 3} } }
        >>> dotted_key_pop(d, 'a.b.c')
        3
        >>> d
        {'a': {'b': {}}}

        >>> dotted_key_pop(d, 'a.z', default=0)
        0

        >>> l = {'a': [1, 2, 3]}
        >>> dotted_key_pop(l, 'a.1')
        2
        >>> dotted_key_pop(l, 'a.bogus', default=0)
        0
    """
    val = dict_
    try:
        key_parts = dotted_key.split(separator)
        # walk down to the container holding the final key
        for key in key_parts[:-1]:
            if isinstance(val, dict):
                val = val[key]
            elif isinstance(val, (list, tuple)):
                val = val[int(key)]
            else:
                val = getattr(val, key)
        # now with just the final part of the key
        key = key_parts[-1]
        if isinstance(val, dict):
            retval = val[key]
            del val[key]
        elif isinstance(val, (list, tuple)):
            # note: tuples are immutable, so the del raises TypeError there
            index = int(key)
            retval = val[index]
            del val[index]
        else:
            retval = getattr(val, key)
            delattr(val, key)
    except (KeyError, IndexError, AttributeError, ValueError):
        # ValueError: a non-integer key part was applied to a list/tuple,
        # which is just another way for the path not to exist
        if default is KeyError:
            raise
        retval = default
    return retval
def dotted_key_set(dict_or_list, dotted_key, value, separator='.'): def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
""" """
@ -109,7 +140,10 @@ def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
if i == len(keys)-1: if i == len(keys)-1:
dict_or_list[key] = value dict_or_list[key] = value
else: else:
dict_or_list = dict_or_list.setdefault(key, {}) try:
dict_or_list = dict_or_list[key]
except KeyError:
break
# if current location is a list: call dotted_key_set on each element # if current location is a list: call dotted_key_set on each element
elif isinstance(dict_or_list, (tuple, list)): elif isinstance(dict_or_list, (tuple, list)):