DateCleaner, FieldRenamer, etc.
This commit is contained in:
parent
07842557fb
commit
163d92d44b
@ -4,29 +4,47 @@
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
import saucebrush
|
import saucebrush
|
||||||
from saucebrush import utils
|
|
||||||
from saucebrush.filters import *
|
from saucebrush.filters import *
|
||||||
from saucebrush.emitters import DebugEmitter
|
from saucebrush.emitters import DjangoModelEmitter, DebugEmitter
|
||||||
import lobbyists
|
import lobbyists
|
||||||
|
|
||||||
def process_sopr_filing(sopr_xml_file):
|
def process_sopr_filing(sopr_xml_file):
|
||||||
|
from sunlightapi import settings as DJ_SETTINGS
|
||||||
|
DJ_APPLABEL = 'lobbyists'
|
||||||
|
|
||||||
saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file),
|
saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file),
|
||||||
FieldRemover(['govt_entities', 'affiliated_orgs']),
|
FieldRemover(['govt_entities', 'affiliated_orgs']),
|
||||||
Flattener(['issues', 'lobbyists']),
|
Flattener(['issues', 'lobbyists']),
|
||||||
saucebrush.filters.Splitter({
|
FieldCopier({'issues.filing_id': 'filing.id',
|
||||||
'client':[FieldRemover(['state_or_local_gov', 'status']), NameCleaner(['contact_name'])],
|
'client.filing_id': 'filing.id',
|
||||||
'filing':[FieldRemover(['affiliated_orgs_url'])],
|
'lobbyists.filing_id': 'filing.id',
|
||||||
'issues':[],
|
'registrant.filing_id': 'filing.id'}),
|
||||||
'lobbyists':[FieldRemover(['indicator', 'status']), NameCleaner(['name']), Unique()],
|
saucebrush.filters.Splitter({
|
||||||
'registrant':[NameCleaner(['name'])],
|
'client':[FieldRemover(['state_or_local_gov', 'status']),
|
||||||
}),
|
NameCleaner(['contact_name']),
|
||||||
FieldCopier({'issues.filing_id': 'filing.id',
|
FieldRenamer({'raw_contact_name': 'contact_name'}),
|
||||||
'client.filing_id': 'filing.id',
|
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'client')
|
||||||
'lobbyists.filing_id': 'filing.id',
|
],
|
||||||
'registrant.filing_id': 'filing.id'}),
|
'filing':[FieldRemover(['affiliated_orgs_url']),
|
||||||
DebugEmitter(open('test.out','w')),
|
DateCleaner(['filing_date'], from_format='%Y-%m-%dT00:00:00', to_format='%Y-%m-%d'),
|
||||||
|
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'filing')
|
||||||
|
],
|
||||||
|
'issues':[DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'issue')],
|
||||||
|
'lobbyists':[FieldRemover(['indicator', 'status']),
|
||||||
|
NameCleaner(['name']),
|
||||||
|
FieldRenamer({'raw_name': 'name'}),
|
||||||
|
Unique(),
|
||||||
|
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'lobbyist')
|
||||||
|
],
|
||||||
|
'registrant':[NameCleaner(['name']),
|
||||||
|
FieldRenamer({'raw_name': 'name'}),
|
||||||
|
DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'registrant')
|
||||||
|
],
|
||||||
|
}),
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
process_sopr_filing('sample.xml')
|
import sys
|
||||||
|
for fname in sys.argv[1:]:
|
||||||
|
print 'processing', fname
|
||||||
|
process_sopr_filing(fname)
|
@ -156,7 +156,9 @@ class DjangoModelEmitter(Emitter):
|
|||||||
def __init__(self, dj_settings, app_label, model_name):
|
def __init__(self, dj_settings, app_label, model_name):
|
||||||
super(DjangoModelEmitter, self).__init__()
|
super(DjangoModelEmitter, self).__init__()
|
||||||
from saucebrush.utils import get_django_model
|
from saucebrush.utils import get_django_model
|
||||||
self.dbmodel = get_django_model(dj_settings, app_label, model_name)
|
self._dbmodel = get_django_model(dj_settings, app_label, model_name)
|
||||||
|
if not self._dbmodel:
|
||||||
|
raise Exception("No such model: %s %s" % (app_label, model_name))
|
||||||
|
|
||||||
def emit_record(self, record):
|
def emit_record(self, record):
|
||||||
self.dbmodel.objects.create(**record)
|
self._dbmodel.objects.create(**record)
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
|
|
||||||
from saucebrush import utils
|
from saucebrush import utils
|
||||||
import re
|
import re
|
||||||
|
import time
|
||||||
|
|
||||||
######################
|
######################
|
||||||
## Abstract Filters ##
|
## Abstract Filters ##
|
||||||
@ -226,6 +227,26 @@ class FieldCopier(Filter):
|
|||||||
utils.dotted_key_set(record, dest, srcval)
|
utils.dotted_key_set(record, dest, srcval)
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
class FieldRenamer(Filter):
    """ Filter that renames one field to another.

        Takes a dictionary mapping destination keys to source keys.
        Each source value is removed with utils.dotted_key_pop and stored
        under its destination key with utils.dotted_key_set.

        Sources that do not exist in a record are skipped silently; the
        record is otherwise left untouched for that mapping entry.
    """

    def __init__(self, rename_mapping):
        super(FieldRenamer, self).__init__()
        self._rename_mapping = rename_mapping

    def process_record(self, record):
        """ Move every mapped source field to its destination key and
            return the (mutated) record.
        """
        # mapping is dest:source
        for dest, source in self._rename_mapping.items():
            try:
                srcval = utils.dotted_key_pop(record, source)
            except (KeyError, IndexError, AttributeError):
                # silently pass if source key didn't exist; these are the
                # lookup errors dotted_key_pop re-raises for a missing path
                continue
            # kept outside the try so a failure while writing is not
            # swallowed after the source value has already been removed
            utils.dotted_key_set(record, dest, srcval)
        return record
|
||||||
|
|
||||||
class Splitter(Filter):
|
class Splitter(Filter):
|
||||||
""" Filter that splits nested data into different paths.
|
""" Filter that splits nested data into different paths.
|
||||||
@ -333,6 +354,19 @@ class PhoneNumberCleaner(FieldFilter):
|
|||||||
item = self._number_format % tuple(nums)
|
item = self._number_format % tuple(nums)
|
||||||
return item
|
return item
|
||||||
|
|
||||||
|
class DateCleaner(FieldFilter):
    """ Filter that rewrites date strings from one format into another.

        Takes a list of target keys, the format the incoming values are
        parsed with (from_format) and the format they are rendered in
        (to_format). Both are strptime/strftime style format strings.
    """

    def __init__(self, keys, from_format, to_format):
        super(DateCleaner, self).__init__(keys)
        self._to_format = to_format
        self._from_format = from_format

    def process_field(self, item):
        # parse using the incoming format, then render in the outgoing one
        parsed = time.strptime(item, self._from_format)
        return time.strftime(self._to_format, parsed)
|
||||||
|
|
||||||
class NameCleaner(Filter):
|
class NameCleaner(Filter):
|
||||||
""" Filter that splits names into a first, last, and middle name field.
|
""" Filter that splits names into a first, last, and middle name field.
|
||||||
|
@ -61,8 +61,8 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
|||||||
"""
|
"""
|
||||||
Do a lookup within dict_ by the various elements of dotted_key.
|
Do a lookup within dict_ by the various elements of dotted_key.
|
||||||
|
|
||||||
Optionally specifiy a default to return if key does not exist (similar
|
Optionally specify a default to return if key does not exist (similar
|
||||||
to default
|
to default)
|
||||||
|
|
||||||
>>> d = {'a': {'b': {'c': 3} } }
|
>>> d = {'a': {'b': {'c': 3} } }
|
||||||
>>> dotted_key_lookup(d, 'a.b.c')
|
>>> dotted_key_lookup(d, 'a.b.c')
|
||||||
@ -87,6 +87,37 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
|||||||
val = default
|
val = default
|
||||||
return val
|
return val
|
||||||
|
|
||||||
|
def dotted_key_pop(dict_, dotted_key, default=KeyError, separator='.'):
    """
    Remove and return a value within dict_ by the various elements of
    dotted_key.

    Each part of the key is resolved by dict key, by integer index for
    lists and tuples, or by attribute for any other object.

    Optionally specify a default to return if the path does not exist.
    When no default is given (the KeyError sentinel), the underlying
    lookup error is re-raised instead.

    >>> d = {'a': {'b': {'c': 3} } }
    >>> dotted_key_pop(d, 'a.b.c')
    3
    >>> d
    {'a': {'b': {}}}
    >>> dotted_key_pop(d, 'a.x', default=None) is None
    True
    """
    val = dict_
    try:
        key_parts = dotted_key.split(separator)

        # walk down to the container that holds the final key
        for key in key_parts[:-1]:
            if isinstance(val, dict):
                val = val[key]
            elif isinstance(val, (list, tuple)):
                val = val[int(key)]
            else:
                val = getattr(val, key)

        # now with just the final part of the key
        key = key_parts[-1]
        if isinstance(val, dict):
            retval = val[key]
            del val[key]
        elif isinstance(val, (list, tuple)):
            retval = val[int(key)]
            del val[int(key)]
        else:
            retval = getattr(val, key)
            delattr(val, key)
    except (KeyError, IndexError, AttributeError, ValueError):
        # ValueError: a non-integer key part was applied to a list/tuple,
        # which is just another way for the path to be missing
        if default is KeyError:
            raise
        retval = default
    return retval
|
||||||
|
|
||||||
def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
|
def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
|
||||||
"""
|
"""
|
||||||
@ -109,7 +140,10 @@ def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
|
|||||||
if i == len(keys)-1:
|
if i == len(keys)-1:
|
||||||
dict_or_list[key] = value
|
dict_or_list[key] = value
|
||||||
else:
|
else:
|
||||||
dict_or_list = dict_or_list.setdefault(key, {})
|
try:
|
||||||
|
dict_or_list = dict_or_list[key]
|
||||||
|
except KeyError:
|
||||||
|
break
|
||||||
|
|
||||||
# if current location is a list: call dotted_key_set on each element
|
# if current location is a list: call dotted_key_set on each element
|
||||||
elif isinstance(dict_or_list, (tuple, list)):
|
elif isinstance(dict_or_list, (tuple, list)):
|
||||||
|
Loading…
Reference in New Issue
Block a user