cleanup of basic filters from writing tests
This commit is contained in:
parent
079857c961
commit
30cb36e17e
@ -72,7 +72,7 @@ class FieldFilter(Filter):
|
|||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
""" Calls process_field on all keys passed to __init__. """
|
""" Calls process_field on all keys passed to __init__. """
|
||||||
|
|
||||||
for key in self._target_keys:
|
for key in self._target_keys:
|
||||||
try:
|
try:
|
||||||
item = record[key]
|
item = record[key]
|
||||||
@ -84,7 +84,7 @@ class FieldFilter(Filter):
|
|||||||
|
|
||||||
def process_field(self, item):
|
def process_field(self, item):
|
||||||
""" Given a value, return the value that it should be replaced with. """
|
""" Given a value, return the value that it should be replaced with. """
|
||||||
|
|
||||||
raise NotImplementedError('process_field not defined in ' +
|
raise NotImplementedError('process_field not defined in ' +
|
||||||
self.__class__.__name__)
|
self.__class__.__name__)
|
||||||
|
|
||||||
@ -93,18 +93,14 @@ class FieldFilter(Filter):
|
|||||||
|
|
||||||
class ConditionalFilter(YieldFilter):
|
class ConditionalFilter(YieldFilter):
|
||||||
""" ABC for filters that only pass through records meeting a condition.
|
""" ABC for filters that only pass through records meeting a condition.
|
||||||
|
|
||||||
All derived filters must provide a test_record(self, record) that
|
All derived filters must provide a test_record(self, record) that
|
||||||
returns True or False -- True indicating that the record should be
|
returns True or False -- True indicating that the record should be
|
||||||
passed through, and False preventing pass through.
|
passed through, and False preventing pass through.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super(ConditionalFilter, self).__init__()
|
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
""" Yields all records for which self.test_record is true """
|
""" Yields all records for which self.test_record is true """
|
||||||
|
|
||||||
if self.test_record(record):
|
if self.test_record(record):
|
||||||
yield record
|
yield record
|
||||||
|
|
||||||
@ -194,7 +190,7 @@ class FieldAdder(Filter):
|
|||||||
|
|
||||||
from itertools import count
|
from itertools import count
|
||||||
FieldAdder('id', count)
|
FieldAdder('id', count)
|
||||||
|
|
||||||
would yield a new column named id that uses the itertools count iterable
|
would yield a new column named id that uses the itertools count iterable
|
||||||
to create sequentially numbered ids.
|
to create sequentially numbered ids.
|
||||||
"""
|
"""
|
||||||
@ -208,13 +204,11 @@ class FieldAdder(Filter):
|
|||||||
self._replace = replace
|
self._replace = replace
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
if self._field_name not in record:
|
if self._field_name not in record or self._replace:
|
||||||
if callable(self._field_value):
|
if callable(self._field_value):
|
||||||
record[self._field_name] = self._field_value()
|
record[self._field_name] = self._field_value()
|
||||||
else:
|
else:
|
||||||
record[self._field_name] = self._field_value
|
record[self._field_name] = self._field_value
|
||||||
elif self._replace:
|
|
||||||
record[self._field_name] = self._field_value
|
|
||||||
return record
|
return record
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
@ -223,14 +217,14 @@ class FieldAdder(Filter):
|
|||||||
|
|
||||||
class FieldCopier(Filter):
|
class FieldCopier(Filter):
|
||||||
""" Filter that copies one field to another.
|
""" Filter that copies one field to another.
|
||||||
|
|
||||||
Takes a dictionary mapping destination keys to source keys.
|
Takes a dictionary mapping destination keys to source keys.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, copy_mapping):
|
def __init__(self, copy_mapping):
|
||||||
super(FieldCopier, self).__init__()
|
super(FieldCopier, self).__init__()
|
||||||
self._copy_mapping = copy_mapping
|
self._copy_mapping = copy_mapping
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
# mapping is dest:source
|
# mapping is dest:source
|
||||||
for dest, source in self._copy_mapping.iteritems():
|
for dest, source in self._copy_mapping.iteritems():
|
||||||
@ -239,22 +233,17 @@ class FieldCopier(Filter):
|
|||||||
|
|
||||||
class FieldRenamer(Filter):
|
class FieldRenamer(Filter):
|
||||||
""" Filter that renames one field to another.
|
""" Filter that renames one field to another.
|
||||||
|
|
||||||
Takes a dictionary mapping destination keys to source keys.
|
Takes a dictionary mapping destination keys to source keys.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, rename_mapping):
|
def __init__(self, rename_mapping):
|
||||||
super(FieldRenamer, self).__init__()
|
super(FieldRenamer, self).__init__()
|
||||||
self._rename_mapping = rename_mapping
|
self._rename_mapping = rename_mapping
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
# mapping is dest:source
|
# mapping is dest:source
|
||||||
for dest, source in self._rename_mapping.iteritems():
|
for dest, source in self._rename_mapping.iteritems():
|
||||||
try:
|
record[dest] = record.pop(source)
|
||||||
record[dest] = record.pop(source)
|
|
||||||
except KeyError:
|
|
||||||
# silently pass if source key didn't exist
|
|
||||||
pass
|
|
||||||
return record
|
return record
|
||||||
|
|
||||||
class Splitter(Filter):
|
class Splitter(Filter):
|
||||||
@ -299,7 +288,7 @@ class Splitter(Filter):
|
|||||||
|
|
||||||
class Flattener(FieldFilter):
|
class Flattener(FieldFilter):
|
||||||
""" Collapse a set of similar dictionaries into a list.
|
""" Collapse a set of similar dictionaries into a list.
|
||||||
|
|
||||||
Takes a dictionary of keys and flattens the key names:
|
Takes a dictionary of keys and flattens the key names:
|
||||||
|
|
||||||
addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
|
addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
|
||||||
@ -313,7 +302,7 @@ class Flattener(FieldFilter):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, keys):
|
def __init__(self, keys):
|
||||||
super(Flattener, self).__init__(keys)
|
super(Flattener, self).__init__(keys)
|
||||||
|
|
||||||
def process_field(self, item):
|
def process_field(self, item):
|
||||||
result = []
|
result = []
|
||||||
for d in item:
|
for d in item:
|
||||||
@ -323,13 +312,13 @@ class Flattener(FieldFilter):
|
|||||||
result.append(rec)
|
result.append(rec)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
class DictFlattener(Filter):
|
class DictFlattener(Filter):
|
||||||
def __init__(self, keys, separator='_'):
|
def __init__(self, keys, separator='_'):
|
||||||
super(DictFlattener, self).__init__()
|
super(DictFlattener, self).__init__()
|
||||||
self._keys = utils.str_or_list(keys)
|
self._keys = utils.str_or_list(keys)
|
||||||
self._separator = separator
|
self._separator = separator
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
return utils.flatten(record, keys=self._keys, separator=self._separator)
|
return utils.flatten(record, keys=self._keys, separator=self._separator)
|
||||||
|
|
||||||
@ -337,11 +326,11 @@ class DictFlattener(Filter):
|
|||||||
class Unique(ConditionalFilter):
|
class Unique(ConditionalFilter):
|
||||||
""" Filter that ensures that all records passing through are unique.
|
""" Filter that ensures that all records passing through are unique.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super(Unique, self).__init__()
|
super(Unique, self).__init__()
|
||||||
self._seen = set()
|
self._seen = set()
|
||||||
|
|
||||||
def test_record(self, record):
|
def test_record(self, record):
|
||||||
record_hash = hash(repr(record))
|
record_hash = hash(repr(record))
|
||||||
if record_hash not in self._seen:
|
if record_hash not in self._seen:
|
||||||
@ -353,12 +342,12 @@ class Unique(ConditionalFilter):
|
|||||||
class UnicodeFilter(Filter):
|
class UnicodeFilter(Filter):
|
||||||
""" Convert all str elements in the record to Unicode.
|
""" Convert all str elements in the record to Unicode.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, encoding='utf-8', errors='ignore'):
|
def __init__(self, encoding='utf-8', errors='ignore'):
|
||||||
super(UnicodeFilter, self).__init__()
|
super(UnicodeFilter, self).__init__()
|
||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._errors = errors
|
self._errors = errors
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
for key, value in record.iteritems():
|
for key, value in record.iteritems():
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
@ -368,18 +357,18 @@ class UnicodeFilter(Filter):
|
|||||||
return record
|
return record
|
||||||
|
|
||||||
class StringFilter(Filter):
|
class StringFilter(Filter):
|
||||||
|
|
||||||
def __init__(self, encoding='utf-8', errors='ignore'):
|
def __init__(self, encoding='utf-8', errors='ignore'):
|
||||||
super(UnicodeFilter, self).__init__()
|
super(UnicodeFilter, self).__init__()
|
||||||
self._encoding = encoding
|
self._encoding = encoding
|
||||||
self._errors = errors
|
self._errors = errors
|
||||||
|
|
||||||
def process_record(self, record):
|
def process_record(self, record):
|
||||||
for key, value in record.iteritems():
|
for key, value in record.iteritems():
|
||||||
if isinstance(value, unicode):
|
if isinstance(value, unicode):
|
||||||
record[key] = value.encode(self._encoding, self._errors)
|
record[key] = value.encode(self._encoding, self._errors)
|
||||||
return record
|
return record
|
||||||
|
|
||||||
|
|
||||||
###########################
|
###########################
|
||||||
## Commonly Used Filters ##
|
## Commonly Used Filters ##
|
||||||
@ -407,14 +396,14 @@ class PhoneNumberCleaner(FieldFilter):
|
|||||||
|
|
||||||
class DateCleaner(FieldFilter):
|
class DateCleaner(FieldFilter):
|
||||||
""" Filter that cleans dates to match a given format.
|
""" Filter that cleans dates to match a given format.
|
||||||
|
|
||||||
Takes a list of target keys and to and from formats in strftime format.
|
Takes a list of target keys and to and from formats in strftime format.
|
||||||
"""
|
"""
|
||||||
def __init__(self, keys, from_format, to_format):
|
def __init__(self, keys, from_format, to_format):
|
||||||
super(DateCleaner, self).__init__(keys)
|
super(DateCleaner, self).__init__(keys)
|
||||||
self._from_format = from_format
|
self._from_format = from_format
|
||||||
self._to_format = to_format
|
self._to_format = to_format
|
||||||
|
|
||||||
def process_field(self, item):
|
def process_field(self, item):
|
||||||
return time.strftime(self._to_format,
|
return time.strftime(self._to_format,
|
||||||
time.strptime(item, self._from_format))
|
time.strptime(item, self._from_format))
|
||||||
@ -428,21 +417,21 @@ class NameCleaner(Filter):
|
|||||||
would attempt to split 'name' into firstname, middlename, lastname,
|
would attempt to split 'name' into firstname, middlename, lastname,
|
||||||
and suffix columns, and if it did not fit would place it in raw_name
|
and suffix columns, and if it did not fit would place it in raw_name
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# first middle? last suffix?
|
# first middle? last suffix?
|
||||||
FIRST_LAST = re.compile('''^\s*(?:(?P<firstname>\w+)(?:\.?)
|
FIRST_LAST = re.compile('''^\s*(?:(?P<firstname>\w+)(?:\.?)
|
||||||
\s+(?:(?P<middlename>\w+)\.?\s+)?
|
\s+(?:(?P<middlename>\w+)\.?\s+)?
|
||||||
(?P<lastname>[A-Za-z'-]+))
|
(?P<lastname>[A-Za-z'-]+))
|
||||||
(?:\s+(?P<suffix>JR\.?|II|III|IV))?
|
(?:\s+(?P<suffix>JR\.?|II|III|IV))?
|
||||||
\s*$''', re.VERBOSE | re.IGNORECASE)
|
\s*$''', re.VERBOSE | re.IGNORECASE)
|
||||||
|
|
||||||
# last, first middle? suffix?
|
# last, first middle? suffix?
|
||||||
LAST_FIRST = re.compile('''^\s*(?:(?P<lastname>[A-Za-z'-]+),
|
LAST_FIRST = re.compile('''^\s*(?:(?P<lastname>[A-Za-z'-]+),
|
||||||
\s+(?P<firstname>\w+)(?:\.?)
|
\s+(?P<firstname>\w+)(?:\.?)
|
||||||
(?:\s+(?P<middlename>\w+)\.?)?)
|
(?:\s+(?P<middlename>\w+)\.?)?)
|
||||||
(?:\s+(?P<suffix>JR\.?|II|III|IV))?
|
(?:\s+(?P<suffix>JR\.?|II|III|IV))?
|
||||||
\s*$''', re.VERBOSE | re.IGNORECASE)
|
\s*$''', re.VERBOSE | re.IGNORECASE)
|
||||||
|
|
||||||
def __init__(self, keys, prefix='', formats=None, nomatch_name=None):
|
def __init__(self, keys, prefix='', formats=None, nomatch_name=None):
|
||||||
super(NameCleaner, self).__init__()
|
super(NameCleaner, self).__init__()
|
||||||
self._keys = utils.str_or_list(keys)
|
self._keys = utils.str_or_list(keys)
|
||||||
@ -457,22 +446,22 @@ class NameCleaner(Filter):
|
|||||||
# run for each key (not using a FieldFilter due to multi-field output)
|
# run for each key (not using a FieldFilter due to multi-field output)
|
||||||
for key in self._keys:
|
for key in self._keys:
|
||||||
name = record[key]
|
name = record[key]
|
||||||
|
|
||||||
# check if key matches any formats
|
# check if key matches any formats
|
||||||
for format in self._name_formats:
|
for format in self._name_formats:
|
||||||
match = format.match(name)
|
match = format.match(name)
|
||||||
|
|
||||||
# if there is a match, remove original name and add pieces
|
# if there is a match, remove original name and add pieces
|
||||||
if match:
|
if match:
|
||||||
record.pop(key)
|
record.pop(key)
|
||||||
for k,v in match.groupdict().iteritems():
|
for k,v in match.groupdict().iteritems():
|
||||||
record[self._name_prefix + k] = v
|
record[self._name_prefix + k] = v
|
||||||
break
|
break
|
||||||
|
|
||||||
# if there is no match, move name into nomatch_name
|
# if there is no match, move name into nomatch_name
|
||||||
else:
|
else:
|
||||||
if self._nomatch_name:
|
if self._nomatch_name:
|
||||||
record.pop(key)
|
record.pop(key)
|
||||||
record[self._nomatch_name] = name
|
record[self._nomatch_name] = name
|
||||||
|
|
||||||
return record
|
return record
|
||||||
|
@ -175,7 +175,15 @@ class FilterTestCase(unittest.TestCase):
|
|||||||
{'x':1, 'y':10, 'c':100}]
|
{'x':1, 'y':10, 'c':100}]
|
||||||
self.assert_filter_result(fr, expected_data)
|
self.assert_filter_result(fr, expected_data)
|
||||||
|
|
||||||
|
# splitter
|
||||||
|
# flatteners?
|
||||||
|
# unique
|
||||||
|
# unicode filter
|
||||||
|
# string filter
|
||||||
|
|
||||||
|
# phone # cleaner
|
||||||
|
# date cleaner
|
||||||
|
# name cleaner
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
Loading…
Reference in New Issue
Block a user