Merge branch 'master' of github.com:sunlightlabs/saucebrush

This commit is contained in:
James Turk 2012-01-06 22:52:38 -05:00
commit 196259e6c0
8 changed files with 154 additions and 15 deletions

27
LICENSE Normal file
View File

@ -0,0 +1,27 @@
Copyright (c) 2011, Sunlight Labs
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of Sunlight Labs nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -59,26 +59,40 @@ class CountEmitter(Emitter):
CountEmitter() by default writes to stdout.
CountEmitter(outfile=open('test', 'w')) would print to a file named test.
CountEmitter(every=1000000) would write the count every 1,000,000 records.
CountEmitter(every=100, of=2000) would write "<count> of 2000" every 100 records.
"""
def __init__(self, every=1000, outfile=None, format=None):
def __init__(self, every=1000, of=None, outfile=None, format=None):
super(CountEmitter, self).__init__()
if not outfile:
import sys
self._outfile = sys.stdout
else:
self._outfile = outfile
self._format = "%s\n" if format is None else format
if format is None:
if of is not None:
format = "%(count)s of %(of)s\n"
else:
format = "%(count)s\n"
self._format = format
self._every = every
self._of = of
self.count = 0
def __str__(self):
return self._format % {'count': self.count, 'of': self._of}
def emit_record(self, record):
self.count += 1
if self.count % self._every == 0:
self._outfile.write(self._format % self.count)
self._outfile.write(str(self))
def done(self):
self._outfile.write(self._format % self.count)
self._outfile.write(str(self))
class CSVEmitter(Emitter):

View File

@ -215,6 +215,24 @@ class FieldModifier(FieldFilter):
str(self._target_keys), str(self._filter_func))
class FieldKeeper(Filter):
    """ Filter that removes all but the given set of fields.

        FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs
        fields from every record filtered.
    """

    def __init__(self, keys):
        """ Remember which fields to keep.

            :param keys: the field name(s) to keep; passed through
                utils.str_or_list before being stored
        """
        super(FieldKeeper, self).__init__()
        self._target_keys = utils.str_or_list(keys)

    def process_record(self, record):
        """ Delete every field of record that is not a target key.

            The record is modified in place and also returned.
        """
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating a live key view raises RuntimeError ("dictionary changed
        # size during iteration") on Python 3. On Python 2 keys() is already
        # a list, so the extra copy is harmless.
        for key in list(record.keys()):
            if key not in self._target_keys:
                del record[key]
        return record
class FieldRemover(Filter):
""" Filter that removes a given set of fields.

View File

@ -1,4 +1,5 @@
from saucebrush.filters import Filter
import collections
import itertools
import math
@ -7,7 +8,9 @@ def _average(values):
:param values: an iterable of ints or floats to average
"""
return sum(values) / float(len(values))
value_count = len(values)
if len(values) > 0:
return sum(values) / float(value_count)
def _median(values):
""" Calculate the median of a list of values.
@ -27,7 +30,7 @@ def _median(values):
if count % 2 == 1:
# odd number of items, return middle value
return values[count / 2]
return float(values[count / 2])
else:
# even number of items, return average of middle two items
mid = count / 2
@ -174,4 +177,49 @@ class StandardDeviation(StatsFilter):
:param population: True if values represents entire population,
False if values is a sample. Default: False
"""
return _stddev(self._values, population)
return _stddev(self._values, population)
class Histogram(StatsFilter):
    """ Generate a basic histogram of the specified field. The value() method
        returns a dict of value to occurrence count mappings. The __str__
        method generates a basic and limited histogram useful for printing
        to the command line. The label_length attribute determines the
        padding and cut-off of the basic histogram labels.

        **This filter maintains a dict of unique field values in memory.**
    """

    # width that every label is padded or truncated to by __str__
    label_length = 6

    def __init__(self, field, **kwargs):
        super(Histogram, self).__init__(field, **kwargs)
        self._counter = collections.Counter()

    def process_field(self, item):
        """ Count one occurrence of the (prepared) field value. """
        self._counter[self.prep_field(item)] += 1

    def prep_field(self, item):
        """ Hook for mapping a raw value to its histogram key.

            Returns the item unchanged here.
        """
        return item

    def value(self):
        """ Return a copy of the value -> count mapping. """
        return self._counter.copy()

    def in_order(self):
        """ Return a list of (value, count) pairs sorted by value. """
        return [(key, self._counter[key]) for key in sorted(self._counter)]

    def most_common(self, n=None):
        """ Return the n most common (value, count) pairs (all if n is None). """
        return self._counter.most_common(n)

    @classmethod
    def as_string(cls, occurences, label_length):
        """ Render (value, count) pairs as a text histogram.

            :param occurences: iterable of (value, count) pairs
            :param label_length: each label is padded and truncated to
                exactly this many characters

            The result starts with a newline, followed by one
            "<label> <stars>" line per pair with one '*' per count.
        """
        # collect the pieces and join once instead of repeated string +=
        lines = ["\n"]
        for key, count in occurences:
            label = str(key).ljust(label_length)[:label_length]
            lines.append("%s %s\n" % (label, "*" * count))
        return "".join(lines)

    def __str__(self):
        return Histogram.as_string(self.in_order(), label_length=self.label_length)

View File

@ -22,13 +22,26 @@ class EmitterTestCase(unittest.TestCase):
self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n')
def test_count_emitter(self):
ce = CountEmitter(every=10, outfile=self.output, format="%s records\n")
data = ce.attach([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22])
for _ in data:
pass
# values for test
values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
# test without of parameter
ce = CountEmitter(every=10, outfile=self.output, format="%(count)s records\n")
list(ce.attach(values))
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n')
ce.done()
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n')
# reset output
self.output.truncate(0)
# test with of parameter
ce = CountEmitter(every=10, outfile=self.output, of=len(values))
list(ce.attach(values))
self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n')
ce.done()
self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n')
if __name__ == '__main__':
unittest.main()

View File

@ -3,7 +3,7 @@ import operator
import types
from saucebrush.filters import (Filter, YieldFilter, FieldFilter,
SubrecordFilter, ConditionalPathFilter,
ConditionalFilter, FieldModifier,
ConditionalFilter, FieldModifier, FieldKeeper,
FieldRemover, FieldMerger, FieldAdder,
FieldCopier, FieldRenamer, Unique)
@ -200,6 +200,13 @@ class FilterTestCase(unittest.TestCase):
{'a':2, 'b':10, 'c':200}]
self.assert_filter_result(fm, expected_data)
def test_field_keeper(self):
    """ FieldKeeper(['c']) should leave only the 'c' field in each record. """
    keeper = FieldKeeper(['c'])

    # compare the filtered records against the expected 'c'-only records
    self.assert_filter_result(keeper, [{'c':3}, {'c':5}, {'c':100}])
def test_field_remover(self):
fr = FieldRemover(['a', 'b'])

View File

@ -1,5 +1,5 @@
import unittest
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram
class StatsTestCase(unittest.TestCase):
@ -19,10 +19,16 @@ class StatsTestCase(unittest.TestCase):
self.assertEqual(fltr.value(), 36.0)
def test_median(self):
    """ Check Median('a') on the full sample data and on its first two records. """
    # odd number of values: the complete sample data
    odd_median = Median('a')
    for _ in odd_median.attach(self._simple_data()):
        pass
    self.assertEqual(odd_median.value(), 1)

    # even number of values: only the first two sample records
    even_median = Median('a')
    first_two = self._simple_data()[:2]
    for _ in even_median.attach(first_two):
        pass
    self.assertEqual(even_median.value(), 3)
def test_minmax(self):
fltr = MinMax('b')
list(fltr.attach(self._simple_data()))
@ -35,6 +41,12 @@ class StatsTestCase(unittest.TestCase):
self.assertEqual(fltr.median(), 5)
self.assertEqual(fltr.value(), (55.4346462061408, 3073.0))
self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665))
def test_histogram(self):
    """ Check the text rendering of a Histogram over field 'a'. """
    histogram = Histogram('a')
    # single-character labels keep the expected output compact
    histogram.label_length = 1

    # drain the filter so every record is counted
    for _ in histogram.attach(self._simple_data()):
        pass

    expected = "\n1 **\n5 *\n"
    self.assertEqual(str(histogram), expected)
if __name__ == '__main__':
unittest.main()

View File

@ -1,3 +1,4 @@
import os
import urllib2
"""
General utilities used within saucebrush that may be useful elsewhere.
@ -93,7 +94,6 @@ class Files(object):
return self.linereader()
def linereader(self):
import os
for path in iter(self.paths):
if os.path.exists(path):
if self.file_open_callback: