saucebrush/examples/sopr_lobbyists.py

54 lines
2.2 KiB
Python
Raw Normal View History

"""
Import SOPR lobbyist filings using lobbyists.py
(http://github.com/dhess/lobbyists/tree/master)
"""
import saucebrush
from saucebrush.filters import *
2008-11-13 21:31:42 +00:00
from saucebrush.emitters import DjangoModelEmitter, DebugEmitter
import lobbyists
def process_sopr_filing(sopr_xml_file):
    """
    Run the filings parsed from ``sopr_xml_file`` through a saucebrush recipe.

    The source is ``lobbyists.parse_filings(sopr_xml_file)``.  The recipe
    flattens the 'filing', 'client' and 'registrant' dictionaries, cleans
    names and the filing date, splits the 'issues' and 'lobbyists' lists
    off to their own DjangoModelEmitter ('issue' / 'lobbyist'), and finally
    hands what is left to the 'filing' DjangoModelEmitter.
    """
    from sunlightapi import settings as dj_settings
    app_label = 'lobbyists'

    def _drop_fraction(value):
        # keep only the text before the first '.', so the result matches
        # the '%Y-%m-%dT%H:%M:%S' format given to DateCleaner below
        return value.split('.')[0]

    filings = lobbyists.parse_filings(sopr_xml_file)

    # flatten non-list dictionaries & clean up some fields
    flatten_steps = [
        DictFlattener(['filing', 'client', 'registrant']),
        FieldRemover(['govt_entities', 'affiliated_orgs', 'foreign_entities',
                      'client_state_or_local_gov', 'client_status',
                      'filing_affiliated_orgs_url']),
        FieldRenamer({'filing_date': 'filing_filing_date'}),
    ]

    # process names & dates
    name_and_date_steps = [
        FieldAdder('client_contact_name', ''),
        FieldAdder('registrant_name', ''),
        NameCleaner('client_contact_name', prefix='client_',
                    nomatch_name='client_raw_contact_name'),
        NameCleaner('registrant_name', prefix='registrant_',
                    nomatch_name='registrant_raw_name'),
        FieldModifier('filing_date', _drop_fraction),
        DateCleaner('filing_date', from_format='%Y-%m-%dT%H:%M:%S',
                    to_format='%Y-%m-%d'),
    ]

    # flatten lists
    list_steps = [
        Flattener(['issues', 'lobbyists']),
        FieldCopier({'issues.filing_id': 'filing_id',
                     'lobbyists.filing_id': 'filing_id'}),
    ]

    # handle lists: each list gets its own sub-recipe via the Splitter
    issue_branch = [
        DjangoModelEmitter(dj_settings, app_label, 'issue'),
    ]
    lobbyist_branch = [
        FieldRemover(['indicator', 'status']),
        NameCleaner(['name']),
        FieldRenamer({'raw_name': 'name'}),
        Unique(),   # remove some duplicate lobbyists on a form
        DjangoModelEmitter(dj_settings, app_label, 'lobbyist'),
    ]

    # split the lists off, drop them from the record, then emit the filing
    final_steps = [
        saucebrush.filters.Splitter({
            'issues': issue_branch,
            'lobbyists': lobbyist_branch,
        }),
        FieldRemover(['issues', 'lobbyists']),
        DjangoModelEmitter(dj_settings, app_label, 'filing'),
    ]

    recipe_steps = (flatten_steps + name_and_date_steps
                    + list_steps + final_steps)
    saucebrush.run_recipe(filings, *recipe_steps)
if __name__ == '__main__':
    import sys

    # each command-line argument is passed to process_sopr_filing in turn
    for fname in sys.argv[1:]:
        # single-argument print(...) parses on both Python 2 and Python 3
        # and writes the same "processing <fname>" line as the old
        # print statement did
        print('processing %s' % fname)
        process_sopr_filing(fname)