diff --git a/examples/sopr_lobbyists.py b/examples/sopr_lobbyists.py index 9cd9003..609855c 100644 --- a/examples/sopr_lobbyists.py +++ b/examples/sopr_lobbyists.py @@ -4,29 +4,47 @@ """ import saucebrush -from saucebrush import utils from saucebrush.filters import * -from saucebrush.emitters import DebugEmitter +from saucebrush.emitters import DjangoModelEmitter, DebugEmitter import lobbyists def process_sopr_filing(sopr_xml_file): + from sunlightapi import settings as DJ_SETTINGS + DJ_APPLABEL = 'lobbyists' saucebrush.run_recipe(lobbyists.parse_filings(sopr_xml_file), - FieldRemover(['govt_entities', 'affiliated_orgs']), - Flattener(['issues', 'lobbyists']), - saucebrush.filters.Splitter({ - 'client':[FieldRemover(['state_or_local_gov', 'status']), NameCleaner(['contact_name'])], - 'filing':[FieldRemover(['affiliated_orgs_url'])], - 'issues':[], - 'lobbyists':[FieldRemover(['indicator', 'status']), NameCleaner(['name']), Unique()], - 'registrant':[NameCleaner(['name'])], - }), - FieldCopier({'issues.filing_id': 'filing.id', - 'client.filing_id': 'filing.id', - 'lobbyists.filing_id': 'filing.id', - 'registrant.filing_id': 'filing.id'}), - DebugEmitter(open('test.out','w')), + FieldRemover(['govt_entities', 'affiliated_orgs']), + Flattener(['issues', 'lobbyists']), + FieldCopier({'issues.filing_id': 'filing.id', + 'client.filing_id': 'filing.id', + 'lobbyists.filing_id': 'filing.id', + 'registrant.filing_id': 'filing.id'}), + saucebrush.filters.Splitter({ + 'client':[FieldRemover(['state_or_local_gov', 'status']), + NameCleaner(['contact_name']), + FieldRenamer({'raw_contact_name': 'contact_name'}), + DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'client') + ], + 'filing':[FieldRemover(['affiliated_orgs_url']), + DateCleaner(['filing_date'], from_format='%Y-%m-%dT00:00:00', to_format='%Y-%m-%d'), + DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'filing') + ], + 'issues':[DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'issue')], + 'lobbyists':[FieldRemover(['indicator', 'status']), + NameCleaner(['name']), + FieldRenamer({'raw_name': 'name'}), + Unique(), + DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'lobbyist') + ], + 'registrant':[NameCleaner(['name']), + FieldRenamer({'raw_name': 'name'}), + DjangoModelEmitter(DJ_SETTINGS, DJ_APPLABEL, 'registrant') + ], + }), ) if __name__ == '__main__': - process_sopr_filing('sample.xml') \ No newline at end of file + import sys + for fname in sys.argv[1:]: + print 'processing', fname + process_sopr_filing(fname) \ No newline at end of file diff --git a/saucebrush/emitters.py b/saucebrush/emitters.py index 58129ec..7a7996e 100644 --- a/saucebrush/emitters.py +++ b/saucebrush/emitters.py @@ -156,7 +156,9 @@ class DjangoModelEmitter(Emitter): def __init__(self, dj_settings, app_label, model_name): super(DjangoModelEmitter, self).__init__() from saucebrush.utils import get_django_model - self.dbmodel = get_django_model(dj_settings, app_label, model_name) + self._dbmodel = get_django_model(dj_settings, app_label, model_name) + if not self._dbmodel: + raise Exception("No such model: %s %s" % (app_label, model_name)) def emit_record(self, record): - self.dbmodel.objects.create(**record) + self._dbmodel.objects.create(**record) diff --git a/saucebrush/filters.py b/saucebrush/filters.py index 13de619..836e7e2 100644 --- a/saucebrush/filters.py +++ b/saucebrush/filters.py @@ -9,6 +9,7 @@ from saucebrush import utils import re +import time ###################### ## Abstract Filters ## @@ -225,7 +226,27 @@ class FieldCopier(Filter): srcval = utils.dotted_key_lookup(record, source) utils.dotted_key_set(record, dest, srcval) return record + +class FieldRenamer(Filter): + """ Filter that renames one field to another. + Takes a dictionary mapping destination keys to source keys. + + """ + def __init__(self, rename_mapping): + super(FieldRenamer, self).__init__() + self._rename_mapping = rename_mapping + + def process_record(self, record): + # mapping is dest:source + for dest, source in self._rename_mapping.iteritems(): + try: + srcval = utils.dotted_key_pop(record, source) + utils.dotted_key_set(record, dest, srcval) + except KeyError: + # silently pass if source key didn't exist + pass + return record class Splitter(Filter): """ Filter that splits nested data into different paths. @@ -333,6 +354,19 @@ class PhoneNumberCleaner(FieldFilter): item = self._number_format % tuple(nums) return item +class DateCleaner(FieldFilter): + """ Filter that cleans dates to match a given format. + + Takes a list of target keys and to and from formats in strftime format. + """ + def __init__(self, keys, from_format, to_format): + super(DateCleaner, self).__init__(keys) + self._from_format = from_format + self._to_format = to_format + + def process_field(self, item): + return time.strftime(self._to_format, + time.strptime(item, self._from_format)) class NameCleaner(Filter): """ Filter that splits names into a first, last, and middle name field. diff --git a/saucebrush/utils.py b/saucebrush/utils.py index 079ef0b..90ec8e4 100644 --- a/saucebrush/utils.py +++ b/saucebrush/utils.py @@ -61,8 +61,8 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'): """ Do a lookup within dict_ by the various elements of dotted_key. - Optionally specifiy a default to return if key does not exist (similar - to default + Optionally specify a default to return if key does not exist (similar + to default) >>> d = {'a': {'b': {'c': 3} } } >>> dotted_key_lookup(d, 'a.b.c') @@ -87,6 +87,37 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'): val = default return val +def dotted_key_pop(dict_, dotted_key, default=KeyError, separator='.'): + """ + Delete a value within dict_ by the various elements of dotted_key. + """ + val = dict_ + try: + key_parts = dotted_key.split(separator) + for key in key_parts[:-1]: + if isinstance(val, dict): + val = val[key] + elif isinstance(val, (list,tuple)): + val = val[int(key)] + else: + val = getattr(val, key) + + # now with just the final part of the key + key = key_parts[-1] + if isinstance(val, dict): + retval = val[key] + del val[key] + elif isinstance(val, (list,tuple)): + retval = val[int(key)] + del val[int(key)] + else: + retval = getattr(val, key) + delattr(val, key) + except (KeyError, IndexError, AttributeError): + if default is KeyError: + raise + retval = default + return retval def dotted_key_set(dict_or_list, dotted_key, value, separator='.'): """ @@ -109,7 +140,10 @@ def dotted_key_set(dict_or_list, dotted_key, value, separator='.'): if i == len(keys)-1: dict_or_list[key] = value else: - dict_or_list = dict_or_list.setdefault(key, {}) + try: + dict_or_list = dict_or_list[key] + except KeyError: + break # if current location is a list: call dotted_key_set on each element elif isinstance(dict_or_list, (tuple, list)):