Merge branch 'master' of github.com:sunlightlabs/saucebrush
This commit is contained in:
commit
196259e6c0
27
LICENSE
Normal file
27
LICENSE
Normal file
@ -0,0 +1,27 @@
|
||||
Copyright (c) 2011, Sunlight Labs
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of Sunlight Labs nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
@ -59,26 +59,40 @@ class CountEmitter(Emitter):
|
||||
CountEmitter() by default writes to stdout.
|
||||
CountEmitter(outfile=open('text', 'w')) would print to a file named text.
|
||||
CountEmitter(every=1000000) would write the count every 1,000,000 records.
|
||||
CountEmitter(every=100, of=2000) would write "<count> of 2000" every 100 records.
|
||||
"""
|
||||
|
||||
def __init__(self, every=1000, outfile=None, format=None):
|
||||
def __init__(self, every=1000, of=None, outfile=None, format=None):
|
||||
|
||||
super(CountEmitter, self).__init__()
|
||||
|
||||
if not outfile:
|
||||
import sys
|
||||
self._outfile = sys.stdout
|
||||
else:
|
||||
self._outfile = outfile
|
||||
self._format = "%s\n" if format is None else format
|
||||
|
||||
if format is None:
|
||||
if of is not None:
|
||||
format = "%(count)s of %(of)s\n"
|
||||
else:
|
||||
format = "%(count)s\n"
|
||||
|
||||
self._format = format
|
||||
self._every = every
|
||||
self._of = of
|
||||
self.count = 0
|
||||
|
||||
def __str__(self):
|
||||
return self._format % {'count': self.count, 'of': self._of}
|
||||
|
||||
def emit_record(self, record):
|
||||
self.count += 1
|
||||
if self.count % self._every == 0:
|
||||
self._outfile.write(self._format % self.count)
|
||||
self._outfile.write(str(self))
|
||||
|
||||
def done(self):
|
||||
self._outfile.write(self._format % self.count)
|
||||
self._outfile.write(str(self))
|
||||
|
||||
|
||||
class CSVEmitter(Emitter):
|
||||
|
@ -215,6 +215,24 @@ class FieldModifier(FieldFilter):
|
||||
str(self._target_keys), str(self._filter_func))
|
||||
|
||||
|
||||
class FieldKeeper(Filter):
|
||||
""" Filter that removes all but the given set of fields.
|
||||
|
||||
FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs
|
||||
fields from every record filtered.
|
||||
"""
|
||||
|
||||
def __init__(self, keys):
|
||||
super(FieldKeeper, self).__init__()
|
||||
self._target_keys = utils.str_or_list(keys)
|
||||
|
||||
def process_record(self, record):
|
||||
for key in record.keys():
|
||||
if key not in self._target_keys:
|
||||
del record[key]
|
||||
return record
|
||||
|
||||
|
||||
class FieldRemover(Filter):
|
||||
""" Filter that removes a given set of fields.
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
from saucebrush.filters import Filter
|
||||
import collections
|
||||
import itertools
|
||||
import math
|
||||
|
||||
@ -7,7 +8,9 @@ def _average(values):
|
||||
|
||||
:param values: an iterable of ints or floats to average
|
||||
"""
|
||||
return sum(values) / float(len(values))
|
||||
value_count = len(values)
|
||||
if len(values) > 0:
|
||||
return sum(values) / float(value_count)
|
||||
|
||||
def _median(values):
|
||||
""" Calculate the median of a list of values.
|
||||
@ -27,7 +30,7 @@ def _median(values):
|
||||
|
||||
if count % 2 == 1:
|
||||
# odd number of items, return middle value
|
||||
return values[count / 2]
|
||||
return float(values[count / 2])
|
||||
else:
|
||||
# even number of items, return average of middle two items
|
||||
mid = count / 2
|
||||
@ -174,4 +177,49 @@ class StandardDeviation(StatsFilter):
|
||||
:param population: True if values represents entire population,
|
||||
False if values is a sample. Default: False
|
||||
"""
|
||||
return _stddev(self._values, population)
|
||||
return _stddev(self._values, population)
|
||||
|
||||
class Histogram(StatsFilter):
|
||||
""" Generate a basic histogram of the specified field. The value() method
|
||||
returns a dict of value to occurrence count mappings. The __str__ method
|
||||
generates a basic and limited histogram useful for printing to the
|
||||
command line. The label_length attribute determines the padding and
|
||||
cut-off of the basic histogram labels.
|
||||
|
||||
**This filter maintains a dict of unique field values in memory.**
|
||||
"""
|
||||
|
||||
label_length = 6
|
||||
|
||||
def __init__(self, field, **kwargs):
|
||||
super(Histogram, self).__init__(field, **kwargs)
|
||||
self._counter = collections.Counter()
|
||||
|
||||
def process_field(self, item):
|
||||
self._counter[self.prep_field(item)] += 1
|
||||
|
||||
def prep_field(self, item):
|
||||
return item
|
||||
|
||||
def value(self):
|
||||
return self._counter.copy()
|
||||
|
||||
def in_order(self):
|
||||
ordered = []
|
||||
for key in sorted(self._counter.keys()):
|
||||
ordered.append((key, self._counter[key]))
|
||||
return ordered
|
||||
|
||||
def most_common(self, n=None):
|
||||
return self._counter.most_common(n)
|
||||
|
||||
@classmethod
|
||||
def as_string(self, occurences, label_length):
|
||||
output = "\n"
|
||||
for key, count in occurences:
|
||||
key_str = str(key).ljust(label_length)[:label_length]
|
||||
output += "%s %s\n" % (key_str, "*" * count)
|
||||
return output
|
||||
|
||||
def __str__(self):
|
||||
return Histogram.as_string(self.in_order(), label_length=self.label_length)
|
||||
|
@ -22,13 +22,26 @@ class EmitterTestCase(unittest.TestCase):
|
||||
self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n')
|
||||
|
||||
def test_count_emitter(self):
|
||||
ce = CountEmitter(every=10, outfile=self.output, format="%s records\n")
|
||||
data = ce.attach([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22])
|
||||
for _ in data:
|
||||
pass
|
||||
|
||||
# values for test
|
||||
values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
|
||||
|
||||
# test without of parameter
|
||||
ce = CountEmitter(every=10, outfile=self.output, format="%(count)s records\n")
|
||||
list(ce.attach(values))
|
||||
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n')
|
||||
ce.done()
|
||||
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n')
|
||||
|
||||
# reset output
|
||||
self.output.truncate(0)
|
||||
|
||||
# test with of parameter
|
||||
ce = CountEmitter(every=10, outfile=self.output, of=len(values))
|
||||
list(ce.attach(values))
|
||||
self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n')
|
||||
ce.done()
|
||||
self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n')
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
@ -3,7 +3,7 @@ import operator
|
||||
import types
|
||||
from saucebrush.filters import (Filter, YieldFilter, FieldFilter,
|
||||
SubrecordFilter, ConditionalPathFilter,
|
||||
ConditionalFilter, FieldModifier,
|
||||
ConditionalFilter, FieldModifier, FieldKeeper,
|
||||
FieldRemover, FieldMerger, FieldAdder,
|
||||
FieldCopier, FieldRenamer, Unique)
|
||||
|
||||
@ -200,6 +200,13 @@ class FilterTestCase(unittest.TestCase):
|
||||
{'a':2, 'b':10, 'c':200}]
|
||||
self.assert_filter_result(fm, expected_data)
|
||||
|
||||
def test_field_keeper(self):
|
||||
fk = FieldKeeper(['c'])
|
||||
|
||||
# check against expected results
|
||||
expected_data = [{'c':3}, {'c':5}, {'c':100}]
|
||||
self.assert_filter_result(fk, expected_data)
|
||||
|
||||
def test_field_remover(self):
|
||||
fr = FieldRemover(['a', 'b'])
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
import unittest
|
||||
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation
|
||||
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram
|
||||
|
||||
class StatsTestCase(unittest.TestCase):
|
||||
|
||||
@ -19,10 +19,16 @@ class StatsTestCase(unittest.TestCase):
|
||||
self.assertEqual(fltr.value(), 36.0)
|
||||
|
||||
def test_median(self):
|
||||
# odd number of values
|
||||
fltr = Median('a')
|
||||
list(fltr.attach(self._simple_data()))
|
||||
self.assertEqual(fltr.value(), 1)
|
||||
|
||||
|
||||
# even number of values
|
||||
fltr = Median('a')
|
||||
list(fltr.attach(self._simple_data()[:2]))
|
||||
self.assertEqual(fltr.value(), 3)
|
||||
|
||||
def test_minmax(self):
|
||||
fltr = MinMax('b')
|
||||
list(fltr.attach(self._simple_data()))
|
||||
@ -35,6 +41,12 @@ class StatsTestCase(unittest.TestCase):
|
||||
self.assertEqual(fltr.median(), 5)
|
||||
self.assertEqual(fltr.value(), (55.4346462061408, 3073.0))
|
||||
self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665))
|
||||
|
||||
def test_histogram(self):
|
||||
fltr = Histogram('a')
|
||||
fltr.label_length = 1
|
||||
list(fltr.attach(self._simple_data()))
|
||||
self.assertEqual(str(fltr), "\n1 **\n5 *\n")
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
@ -1,3 +1,4 @@
|
||||
import os
|
||||
import urllib2
|
||||
"""
|
||||
General utilities used within saucebrush that may be useful elsewhere.
|
||||
@ -93,7 +94,6 @@ class Files(object):
|
||||
return self.linereader()
|
||||
|
||||
def linereader(self):
|
||||
import os
|
||||
for path in iter(self.paths):
|
||||
if os.path.exists(path):
|
||||
if self.file_open_callback:
|
||||
|
Loading…
Reference in New Issue
Block a user