things for sopr_lobbyists, including dotted_key_set, flattener, splitter, and name cleaner
This commit is contained in:
parent
0bbb1ee4e4
commit
07842557fb
32
examples/sopr_lobbyists.py
Normal file
32
examples/sopr_lobbyists.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
"""
|
||||||
|
Import SOPR lobbyist filings using lobbyists.py
|
||||||
|
(http://github.com/dhess/lobbyists/tree/master)
|
||||||
|
|
||||||
|
"""
|
||||||
|
import saucebrush
|
||||||
|
from saucebrush import utils
|
||||||
|
from saucebrush.filters import *
|
||||||
|
from saucebrush.emitters import DebugEmitter
|
||||||
|
import lobbyists
|
||||||
|
|
||||||
|
def process_sopr_filing(sopr_xml_file):
    """ Run the SOPR filing recipe over sopr_xml_file.

    Records produced by lobbyists.parse_filings(sopr_xml_file) are passed
    through the filter chain below and finally handed to a DebugEmitter
    that writes to 'test.out' in the current working directory.

    The output file is closed once run_recipe returns, even if the recipe
    raises.
    """
    # NOTE(review): assumes saucebrush.run_recipe has finished processing
    # every record by the time it returns -- confirm before relying on the
    # close() below.
    debug_out = open('test.out', 'w')
    try:
        saucebrush.run_recipe(
            lobbyists.parse_filings(sopr_xml_file),
            # drop nested collections that are not used downstream
            FieldRemover(['govt_entities', 'affiliated_orgs']),
            Flattener(['issues', 'lobbyists']),
            # per-section cleanup of each nested part of a filing
            saucebrush.filters.Splitter({
                'client':[FieldRemover(['state_or_local_gov', 'status']), NameCleaner(['contact_name'])],
                'filing':[FieldRemover(['affiliated_orgs_url'])],
                'issues':[],
                'lobbyists':[FieldRemover(['indicator', 'status']), NameCleaner(['name']), Unique()],
                'registrant':[NameCleaner(['name'])],
            }),
            # mapping is destination: source -- stamp each section with the
            # id of the filing it belongs to
            FieldCopier({'issues.filing_id': 'filing.id',
                         'client.filing_id': 'filing.id',
                         'lobbyists.filing_id': 'filing.id',
                         'registrant.filing_id': 'filing.id'}),
            DebugEmitter(debug_out),
        )
    finally:
        debug_out.close()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
process_sopr_filing('sample.xml')
|
@ -8,6 +8,7 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
from saucebrush import utils
|
from saucebrush import utils
|
||||||
|
import re
|
||||||
|
|
||||||
######################
|
######################
|
||||||
## Abstract Filters ##
|
## Abstract Filters ##
|
||||||
@ -66,18 +67,46 @@ class FieldFilter(Filter):
|
|||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
""" Calls process_field on all keys passed to __init__. """
|
""" Calls process_field on all keys passed to __init__. """
|
||||||
|
|
||||||
for key in self._target_keys:
|
for key in self._target_keys:
|
||||||
record[key] = self.process_field(record[key])
|
try:
|
||||||
|
item = record[key]
|
||||||
|
record[key] = self.process_field(item)
|
||||||
|
except KeyError:
|
||||||
|
# probably want to have a boolean to flag missing fields
|
||||||
|
pass
|
||||||
return record
|
return record
|
||||||
|
|
||||||
def process_field(self, item):
|
def process_field(self, item):
|
||||||
""" Given a value, return the value that it should be replaced with. """
|
""" Given a value, return the value that it should be replaced with. """
|
||||||
|
|
||||||
raise NotImplementedError('process_field not defined in ' +
|
raise NotImplementedError('process_field not defined in ' +
|
||||||
self.__class__.__name__)
|
self.__class__.__name__)
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return '%s( %s )' % (self.__class__.__name__, str(self._target_keys))
|
return '%s( %s )' % (self.__class__.__name__, str(self._target_keys))
|
||||||
|
|
||||||
|
class ConditionalFilter(YieldFilter):
    """ Base class for filters that forward only records passing a test.

        Subclasses must implement test_record(self, record).  A true
        result lets the record through; a false result drops it.
    """

    def __init__(self):
        super(ConditionalFilter, self).__init__()

    def process_record(self, record):
        """ Yield record if self.test_record accepts it, else yield nothing. """
        if not self.test_record(record):
            return
        yield record

    def test_record(self, record):
        """ Return True iff record should be passed on (subclass hook). """
        message = 'test_record not defined in ' + self.__class__.__name__
        raise NotImplementedError(message)
|
||||||
|
|
||||||
#####################
|
#####################
|
||||||
## Generic Filters ##
|
## Generic Filters ##
|
||||||
@ -180,6 +209,23 @@ class FieldAdder(Filter):
|
|||||||
return '%s( %s, %s )' % (self.__class__.__name__, self._field_name,
|
return '%s( %s, %s )' % (self.__class__.__name__, self._field_name,
|
||||||
str(self._field_value))
|
str(self._field_value))
|
||||||
|
|
||||||
|
class FieldCopier(Filter):
    """ Filter that duplicates field values under additional keys.

        copy_mapping is a dictionary of {destination key: source key}.
        Both keys are handed to utils.dotted_key_set and
        utils.dotted_key_lookup respectively.
    """

    def __init__(self, copy_mapping):
        super(FieldCopier, self).__init__()
        self._copy_mapping = copy_mapping

    def process_record(self, record):
        """ Copy every configured source value to its destination key. """
        # entries are (destination, source) pairs
        for destination_key, source_key in self._copy_mapping.iteritems():
            utils.dotted_key_set(
                record,
                destination_key,
                utils.dotted_key_lookup(record, source_key))
        return record
|
||||||
|
|
||||||
|
|
||||||
class Splitter(Filter):
|
class Splitter(Filter):
|
||||||
""" Filter that splits nested data into different paths.
|
""" Filter that splits nested data into different paths.
|
||||||
@ -199,7 +245,11 @@ class Splitter(Filter):
|
|||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
for key, filters in self._split_mapping.iteritems():
|
for key, filters in self._split_mapping.iteritems():
|
||||||
|
|
||||||
subrecord = record[key]
|
# if the key doesn't exist -- move on to next key
|
||||||
|
try:
|
||||||
|
subrecord = record[key]
|
||||||
|
except KeyError:
|
||||||
|
continue
|
||||||
|
|
||||||
# if a dict, use process_record directly
|
# if a dict, use process_record directly
|
||||||
if isinstance(subrecord, dict):
|
if isinstance(subrecord, dict):
|
||||||
@ -217,23 +267,47 @@ class Splitter(Filter):
|
|||||||
return record
|
return record
|
||||||
|
|
||||||
|
|
||||||
class Flattener(Filter):
|
class Flattener(FieldFilter):
|
||||||
""" Collapse a set of similar dictionaries into a list.
|
""" Collapse a set of similar dictionaries into a list.
|
||||||
|
|
||||||
Takes a dictionary of keys and flattens the key names:
|
Takes a dictionary of keys and flattens the key names:
|
||||||
|
|
||||||
addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
|
addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
|
||||||
{'address': {'state':'NY', 'street':'3000 Winton Rd'}}]}]
|
{'address': {'state':'NY', 'street':'3000 Winton Rd'}}]}]
|
||||||
flattener = Flattener(['addresses'])
|
flattener = Flattener(['addresses'])
|
||||||
|
|
||||||
|
would yield:
|
||||||
|
|
||||||
|
{'addresses': [{'state': 'NC', 'street': '146 shirley drive'},
|
||||||
|
{'state': 'NY', 'street': '3000 Winton Rd'}]}
|
||||||
|
"""
|
||||||
|
def __init__(self, keys):
|
||||||
|
super(Flattener, self).__init__(keys)
|
||||||
|
|
||||||
|
def process_field(self, item):
    """ Merge the inner dictionaries of each element of item.

        item is iterated; for every element, all of its values (each a
        mapping) are merged into one new dictionary.  Returns the list of
        merged dictionaries, in the same order as item.
    """
    def merge_inner(wrapper):
        # later values win when inner mappings share a key
        combined = {}
        for inner in wrapper.values():
            combined.update(inner)
        return combined

    return [merge_inner(wrapper) for wrapper in item]
|
||||||
|
|
||||||
|
class Unique(ConditionalFilter):
|
||||||
|
""" Filter that ensures that all records passing through are unique.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(Flattener, self).__init__()
|
super(Unique, self).__init__()
|
||||||
|
self._seen = set()
|
||||||
def process_record(self, record):
|
|
||||||
return utils.flatten(record)
|
def test_record(self, record):
|
||||||
|
record_hash = hash(repr(record))
|
||||||
|
if record_hash not in self._seen:
|
||||||
|
self._seen.add(record_hash)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
###########################
|
###########################
|
||||||
## Commonly Used Filters ##
|
## Commonly Used Filters ##
|
||||||
@ -249,7 +323,6 @@ class PhoneNumberCleaner(FieldFilter):
|
|||||||
would format the phone & fax columns to 555-123-4567 format.
|
would format the phone & fax columns to 555-123-4567 format.
|
||||||
"""
|
"""
|
||||||
def __init__(self, keys, number_format='%s%s%s.%s%s%s.%s%s%s%s'):
|
def __init__(self, keys, number_format='%s%s%s.%s%s%s.%s%s%s%s'):
|
||||||
import re
|
|
||||||
super(PhoneNumberCleaner, self).__init__(keys)
|
super(PhoneNumberCleaner, self).__init__(keys)
|
||||||
self._number_format = number_format
|
self._number_format = number_format
|
||||||
self._num_re = re.compile('\d')
|
self._num_re = re.compile('\d')
|
||||||
@ -259,3 +332,55 @@ class PhoneNumberCleaner(FieldFilter):
|
|||||||
if len(nums) == 10:
|
if len(nums) == 10:
|
||||||
item = self._number_format % tuple(nums)
|
item = self._number_format % tuple(nums)
|
||||||
return item
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
class NameCleaner(Filter):
    """ Filter that splits names into a first, last, and middle name field.

        Takes a list of target keys and, optionally, a list of compiled
        regular expressions (name_formats) to try in order.  When omitted,
        FIRST_LAST is tried and then LAST_FIRST.

        NameCleaner(['name'])

        would, for a record whose 'name' matches one of the formats, remove
        'name' and add the pattern's named groups to the record
        (firstname, middlename, lastname and suffix for the built-in
        formats; groups that did not take part in the match are None).
        A name that matches no format is left untouched.
    """

    # first middle? last suffix?
    FIRST_LAST = re.compile(r'''^\s*(?:(?P<firstname>\w+)(?:\.?)
                                \s+(?:(?P<middlename>\w+)\.?\s+)?
                                (?P<lastname>[A-Za-z'-]+))
                                (?:\s+(?P<suffix>JR\.?|II|III|IV))?
                                \s*$''', re.VERBOSE | re.IGNORECASE)

    # last, first middle? suffix?
    LAST_FIRST = re.compile(r'''^\s*(?:(?P<lastname>[A-Za-z'-]+),
                                \s+(?P<firstname>\w+)(?:\.?)
                                (?:\s+(?P<middlename>\w+)\.?)?)
                                (?:\s+(?P<suffix>JR\.?|II|III|IV))?
                                \s*$''', re.VERBOSE | re.IGNORECASE)

    def __init__(self, keys, name_formats=None):
        super(NameCleaner, self).__init__()
        self._keys = keys
        if name_formats:
            self._name_formats = name_formats
        else:
            self._name_formats = [self.FIRST_LAST, self.LAST_FIRST]

    def process_record(self, record):
        """ Replace each target name field with its parsed components.

            Target keys absent from the record are skipped.  Returns the
            (mutated) record.
        """
        # run for each key (not using a FieldFilter due to multi-field output)
        for key in self._keys:
            try:
                name = record[key]
            except KeyError:
                # a record without this field has nothing to clean
                continue

            # the first format that matches wins
            for name_format in self._name_formats:
                match = name_format.match(name)

                # if there is a match, remove original name and add pieces
                if match:
                    record.pop(key)
                    record.update(match.groupdict())
                    break

            # can add else statement here to log non-names

        return record
|
@ -19,7 +19,7 @@ def get_django_model(dj_settings, app_label, model_name):
|
|||||||
return get_model(app_label, model_name)
|
return get_model(app_label, model_name)
|
||||||
|
|
||||||
|
|
||||||
def string_dig(element, joiner=''):
|
def string_dig(element, separator=''):
|
||||||
"""
|
"""
|
||||||
Dig into BeautifulSoup HTML elements looking for inner strings.
|
Dig into BeautifulSoup HTML elements looking for inner strings.
|
||||||
|
|
||||||
@ -29,11 +29,11 @@ def string_dig(element, joiner=''):
|
|||||||
if element.string:
|
if element.string:
|
||||||
return element.string
|
return element.string
|
||||||
else:
|
else:
|
||||||
return joiner.join([string_dig(child)
|
return separator.join([string_dig(child)
|
||||||
for child in element.findAll(True)])
|
for child in element.findAll(True)])
|
||||||
|
|
||||||
|
|
||||||
def flatten(item, prefix=''):
|
def recursive_flatten(item, prefix=''):
|
||||||
"""
|
"""
|
||||||
Flatten nested dictionary into one with its keys concatenated together.
|
Flatten nested dictionary into one with its keys concatenated together.
|
||||||
|
|
||||||
@ -41,20 +41,22 @@ def flatten(item, prefix=''):
|
|||||||
'f':{'g':{'h':6}}})
|
'f':{'g':{'h':6}}})
|
||||||
{'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
|
{'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# update dictionaries recursively
|
||||||
|
|
||||||
if isinstance(item, dict):
|
if isinstance(item, dict):
|
||||||
# don't prepend a leading _
|
# don't prepend a leading _
|
||||||
if prefix != '':
|
if prefix != '':
|
||||||
prefix += '_'
|
prefix += '_'
|
||||||
retval = {}
|
retval = {}
|
||||||
for key, value in item.iteritems():
|
for key, value in item.iteritems():
|
||||||
retval.update(flatten(value, prefix + key))
|
retval.update(recursive_flatten(value, prefix + key))
|
||||||
return retval
|
return retval
|
||||||
elif isinstance(item, (tuple, list)):
|
elif isinstance(item, (tuple, list)):
|
||||||
return {prefix: [flatten(i) for i in item]}
|
return {prefix: [recursive_flatten(i) for i in item]}
|
||||||
else:
|
else:
|
||||||
return {prefix: item}
|
return {prefix: item}
|
||||||
|
|
||||||
|
|
||||||
def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
||||||
"""
|
"""
|
||||||
Do a lookup within dict_ by the various elements of dotted_key.
|
Do a lookup within dict_ by the various elements of dotted_key.
|
||||||
@ -69,7 +71,6 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
|||||||
-1
|
-1
|
||||||
>>> dotted_key_lookup(d, 'a|b|c', separator='|')
|
>>> dotted_key_lookup(d, 'a|b|c', separator='|')
|
||||||
3
|
3
|
||||||
>>> dotted_key_lookup(d, '
|
|
||||||
"""
|
"""
|
||||||
val = dict_
|
val = dict_
|
||||||
try:
|
try:
|
||||||
@ -85,3 +86,33 @@ def dotted_key_lookup(dict_, dotted_key, default=KeyError, separator='.'):
|
|||||||
raise
|
raise
|
||||||
val = default
|
val = default
|
||||||
return val
|
return val
|
||||||
|
|
||||||
|
|
||||||
|
def dotted_key_set(dict_or_list, dotted_key, value, separator='.'):
    """
    Set a value within dict_or_list using a dotted_key.

    dotted_key is split on separator and followed one segment at a time.
    Missing intermediate dictionaries are created.  When a list or tuple is
    reached, the remainder of the key is applied to every element of it.
    If the path runs into a value that is neither a dictionary nor a
    list/tuple, nothing is set.  Returns None.

    >>> d = {}
    >>> dotted_key_set(d, 'a.b.c', 123)
    >>> d
    {'a': {'b': {'c': 123}}}
    >>> d = {'a': [{}, {}]}
    >>> dotted_key_set(d, 'a.b', 1)
    >>> d
    {'a': [{'b': 1}, {'b': 1}]}
    """

    # split key into composite parts
    keys = dotted_key.split(separator)
    last_index = len(keys) - 1

    for i, key in enumerate(keys):

        # if current location is a dictionary: traverse inward until @ last key
        # set value when last key is reached
        if isinstance(dict_or_list, dict):
            if i == last_index:
                dict_or_list[key] = value
            else:
                dict_or_list = dict_or_list.setdefault(key, {})

        # if current location is a list: call dotted_key_set on each element
        elif isinstance(dict_or_list, (tuple, list)):
            newkey = separator.join(keys[i:])
            for item in dict_or_list:
                dotted_key_set(item, newkey, value, separator)
            # the recursive calls consumed the rest of the key; continuing
            # the loop would apply the trailing segments to the elements again
            return

        # anything else cannot be traversed or assigned into
        else:
            return
|
Loading…
Reference in New Issue
Block a user