DateCleaner, FieldRenamer, etc.
This commit is contained in:
parent
07842557fb
commit
163d92d44b
@ -4,29 +4,47 @@
|
||||
|
||||
"""
|
||||
import saucebrush
|
||||
from saucebrush import utils
|
||||
from saucebrush.filters import *
|
||||
from saucebrush.emitters import DebugEmitter
|
||||
from saucebrush.emitters import DjangoModelEmitter, DebugEmitter
|
||||
import lobbyists
|
||||
|
||||
def process_sopr_filing(sopr_xml_file):
|
||||
from sunlightapi import settings as DJ_SETTINGS
|
||||
DJ_APPLABEL = 'lobbyists'
|
||||
|
||||
saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file),
|
||||
FieldRemover(['govt_entities', 'affiliated_orgs']),
|
||||
Flattener(['issues', 'lobbyists']),
|
||||
saucebrush.filters.Splitter({
|
||||
'client':[FieldRemover(['state_or_local_gov', 'status']), NameCleaner(['contact_name'])],
|
||||
'filing':[FieldRemover(['affiliated_orgs_url'])],
|
||||
'issues':[],
|
||||
'lobbyists':[FieldRemover(['indicator', 'status']), NameCleaner(['name']), Unique()],
|
||||
'registrant':[NameCleaner(['name'])],
|
||||
}),
|
||||
FieldCopier({'issues.filing_id': 'filing.id',
|
||||
'client.filing_id': 'filing.id',
|
||||
'lobbyists.filing_id': 'filing.id',
|
||||
'registrant.filing_id': 'filing.id'}),
|
||||
DebugEmitter(open('test.out','w')),
|
||||
FieldRemover(['govt_entities', 'affiliated_orgs']),
|
||||
Flattener(['issues', 'lobbyists']),
|
||||
FieldCopier({'issues.filing_id': 'filing.id',
|
||||
'client.filing_id': 'filing.id',
|
||||
'lobbyists.filing_id': 'filing.id',
|
||||
'registrant.filing_id': 'filing.id'}),
|
||||
saucebrush.filters.Splitter({
|
||||
'client':[FieldRemover(['state_or_local_gov', 'status']),
|
||||
NameCleaner(['contact_name']),
|
||||
FieldRenamer({'raw_contact_name': 'contact_name'}),
|
||||
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'client')
|
||||
],
|
||||
'filing':[FieldRemover(['affiliated_orgs_url']),
|
||||
DateCleaner(['filing_date'], from_format='%Y-%m-%dT00:00:00', to_format='%Y-%m-%d'),
|
||||
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'filing')
|
||||
],
|
||||
'issues':[DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'issue')],
|
||||
'lobbyists':[FieldRemover(['indicator', 'status']),
|
||||
NameCleaner(['name']),
|
||||
FieldRenamer({'raw_name': 'name'}),
|
||||
Unique(),
|
||||
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'lobbyist')
|
||||
],
|
||||
'registrant':[NameCleaner(['name']),
|
||||
FieldRenamer({'raw_name': 'name'}),
|
||||
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'registrant')
|
||||
],
|
||||
}),
|
||||
)
|
||||
|
||||
if __name__ == '__main__':
|
||||
process_sopr_filing('sample.xml')
|
||||
import sys
|
||||
for fname in sys.argv[1:]:
|
||||
print 'processing', fname
|
||||
process_sopr_filing(fname)
|
@ -156,7 +156,9 @@ class DjangoModelEmitter(Emitter):
|
||||
def __init__(self, dj_settings, app_label, model_name):
|
||||
super(DjangoModelEmitter, self).__init__()
|
||||
from saucebrush.utils import get_django_model
|
||||
self.dbmodel = get_django_model(dj_settings, app_label, model_name)
|
||||
self._dbmodel = get_django_model(dj_settings, app_label, model_name)
|
||||
if not self._dbmodel:
|
||||
raise Exception("No such model: %s %s" % (app_label, model_name))
|
||||
|
||||
def emit_record(self, record):
|
||||
self.dbmodel.objects.create(**record)
|
||||
self._dbmodel.objects.create(**record)
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
from saucebrush import utils
|
||||
import re
|
||||
import time
|
||||
|
||||
######################
|
||||
## Abstract Filters ##
|
||||
@ -226,6 +227,26 @@ class FieldCopier(Filter):
|
||||
utils.dotted_key_set(record, dest, srcval)
|
||||
return record
|
||||
|
||||
class FieldRenamer(Filter):
    """ Filter that renames one field to another.

        Takes a dictionary mapping destination keys to source keys.  Both
        keys are handed to utils.dotted_key_pop / utils.dotted_key_set, so
        they use the same dotted-key notation as those helpers.

        A mapping whose source key is missing from a record is skipped;
        the record is otherwise returned with each source value moved to
        its destination key.
    """
    def __init__(self, rename_mapping):
        super(FieldRenamer, self).__init__()
        self._rename_mapping = rename_mapping

    def process_record(self, record):
        # mapping is dest:source
        # .items() (rather than .iteritems()) works on both Python 2 and 3
        for dest, source in self._rename_mapping.items():
            try:
                srcval = utils.dotted_key_pop(record, source)
            except KeyError:
                # silently pass if source key didn't exist
                continue
            # Kept outside the try block: once the value has been popped, a
            # KeyError raised while storing it must not be swallowed, or the
            # value would be silently dropped from the record.
            utils.dotted_key_set(record, dest, srcval)
        return record
|
||||
|
||||
class Splitter(Filter):
|
||||
""" Filter that splits nested data into different paths.
|
||||
@ -333,6 +354,19 @@ class PhoneNumberCleaner(FieldFilter):
|
||||
item = self._number_format % tuple(nums)
|
||||
return item
|
||||
|
||||
class DateCleaner(FieldFilter):
    """ Filter that rewrites date strings from one format into another.

        Takes a list of target keys, the format the incoming value is
        parsed with (from_format) and the format it is written back in
        (to_format).  Both formats use time.strptime / time.strftime
        directives.
    """
    def __init__(self, keys, from_format, to_format):
        super(DateCleaner, self).__init__(keys)
        self._to_format = to_format
        self._from_format = from_format

    def process_field(self, item):
        # parse with the source format, then render with the target format
        parsed = time.strptime(item, self._from_format)
        return time.strftime(self._to_format, parsed)
|
||||
|
||||
class NameCleaner(Filter):
|
||||
""" Filter that splits names into a first, last, and middle name field.
|
||||
|
@ -61,8 +61,8 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
||||
"""
|
||||
Do a lookup within dict_ by the various elements of dotted_key.
|
||||
|
||||
Optionally specifiy a default to return if key does not exist (similar
|
||||
to default
|
||||
Optionally specify a default to return if key does not exist (similar
|
||||
to default)
|
||||
|
||||
>>> d = {'a': {'b': {'c': 3} } }
|
||||
>>> dotted_key_lookup(d, 'a.b.c')
|
||||
@ -87,6 +87,37 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
||||
val = default
|
||||
return val
|
||||
|
||||
def dotted_key_pop(dict_, dotted_key, default=KeyError, separator='.'):
    """
    Remove and return a value within dict_ by the various elements of
    dotted_key.

    Every element of dotted_key except the last is used to descend one
    level: as a key for dicts, as an integer index for lists and tuples,
    and as an attribute name for anything else.  The last element names
    the item that is removed from its container and returned.

    Optionally specify a default to return if the key does not exist.  If
    default is left as KeyError, the error raised by the failed lookup is
    re-raised instead.

    Tuples do not support item deletion, so a dotted_key whose last
    element indexes into a tuple raises TypeError.

    >>> d = {'a': {'b': {'c': 3}}}
    >>> dotted_key_pop(d, 'a.b.c')
    3
    >>> d
    {'a': {'b': {}}}
    >>> dotted_key_pop(d, 'a.x', default='n/a')
    'n/a'
    >>> nums = {'n': [10, 20, 30]}
    >>> dotted_key_pop(nums, 'n.1')
    20
    >>> nums
    {'n': [10, 30]}
    """
    val = dict_
    try:
        key_parts = dotted_key.split(separator)
        for key in key_parts[:-1]:
            if isinstance(val, dict):
                val = val[key]
            elif isinstance(val, (list, tuple)):
                val = val[int(key)]
            else:
                val = getattr(val, key)

        # now with just the final part of the key
        key = key_parts[-1]
        if isinstance(val, dict):
            retval = val.pop(key)
        elif isinstance(val, (list, tuple)):
            index = int(key)
            retval = val[index]
            del val[index]
        else:
            retval = getattr(val, key)
            delattr(val, key)
    except (KeyError, IndexError, AttributeError, ValueError):
        # ValueError comes from int(key) when a sequence is addressed with
        # a non-integer key part; treat it like any other missing key so
        # that the caller's default is honoured.
        if default is KeyError:
            raise
        retval = default
    return retval
|
||||
|
||||
def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
|
||||
"""
|
||||
@ -109,7 +140,10 @@ def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
|
||||
if i == len(keys)-1:
|
||||
dict_or_list[key] = value
|
||||
else:
|
||||
dict_or_list = dict_or_list.setdefault(key, {})
|
||||
try:
|
||||
dict_or_list = dict_or_list[key]
|
||||
except KeyError:
|
||||
break
|
||||
|
||||
# if current location is a list: call dotted_key_set on each element
|
||||
elif isinstance(dict_or_list, (tuple, list)):
|
||||
|
Loading…
Reference in New Issue
Block a user