diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1ee0c7c --- /dev/null +++ b/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011, Sunlight Labs + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Sunlight Labs nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/saucebrush/emitters.py b/saucebrush/emitters.py index a66ac34..8460984 100644 --- a/saucebrush/emitters.py +++ b/saucebrush/emitters.py @@ -59,26 +59,40 @@ class CountEmitter(Emitter): CountEmitter() by default writes to stdout. CountEmitter(outfile=open('text', 'w')) would print to a file name test. 
CountEmitter(every=1000000) would write the count every 1,000,000 records. + CountEmitter(every=100, of=2000) would write "N of 2000" every 100 records. """ - def __init__(self, every=1000, outfile=None, format=None): + def __init__(self, every=1000, of=None, outfile=None, format=None): + super(CountEmitter, self).__init__() + if not outfile: import sys self._outfile = sys.stdout else: self._outfile = outfile - self._format = "%s\n" if format is None else format + + if format is None: + if of is not None: + format = "%(count)s of %(of)s\n" + else: + format = "%(count)s\n" + + self._format = format self._every = every + self._of = of self.count = 0 + + def __str__(self): + return self._format % {'count': self.count, 'of': self._of} def emit_record(self, record): self.count += 1 if self.count % self._every == 0: - self._outfile.write(self._format % self.count) + self._outfile.write(str(self)) def done(self): - self._outfile.write(self._format % self.count) + self._outfile.write(str(self)) class CSVEmitter(Emitter): diff --git a/saucebrush/filters.py b/saucebrush/filters.py index 7fee37a..380628f 100644 --- a/saucebrush/filters.py +++ b/saucebrush/filters.py @@ -215,6 +215,24 @@ class FieldModifier(FieldFilter): str(self._target_keys), str(self._filter_func)) +class FieldKeeper(Filter): + """ Filter that removes all but the given set of fields. + + FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs + fields from every record filtered. + """ + + def __init__(self, keys): + super(FieldKeeper, self).__init__() + self._target_keys = utils.str_or_list(keys) + + def process_record(self, record): + for key in record.keys(): + if key not in self._target_keys: + del record[key] + return record + + class FieldRemover(Filter): """ Filter that removes a given set of fields. 
diff --git a/saucebrush/stats.py b/saucebrush/stats.py index b726035..0eed9fd 100644 --- a/saucebrush/stats.py +++ b/saucebrush/stats.py @@ -1,4 +1,5 @@ from saucebrush.filters import Filter +import collections import itertools import math @@ -7,7 +8,9 @@ def _average(values): :param values: an iterable of ints or floats to average """ - return sum(values) / float(len(values)) + value_count = len(values) + if len(values) > 0: + return sum(values) / float(value_count) def _median(values): """ Calculate the median of a list of values. @@ -27,7 +30,7 @@ def _median(values): if count % 2 == 1: # odd number of items, return middle value - return values[count / 2] + return float(values[count / 2]) else: # even number of items, return average of middle two items mid = count / 2 @@ -174,4 +177,49 @@ class StandardDeviation(StatsFilter): :param population: True if values represents entire population, False if values is a sample. Default: False """ - return _stddev(self._values, population) \ No newline at end of file + return _stddev(self._values, population) + +class Histogram(StatsFilter): + """ Generate a basic histogram of the specified field. The value() method + returns a dict of value to occurrence count mappings. The __str__ method + generates a basic and limited histogram useful for printing to the + command line. The label_length attribute determines the padding and + cut-off of the basic histogram labels. 
+ + **This filter maintains a dict of unique field values in memory.** + """ + + label_length = 6 + + def __init__(self, field, **kwargs): + super(Histogram, self).__init__(field, **kwargs) + self._counter = collections.Counter() + + def process_field(self, item): + self._counter[self.prep_field(item)] += 1 + + def prep_field(self, item): + return item + + def value(self): + return self._counter.copy() + + def in_order(self): + ordered = [] + for key in sorted(self._counter.keys()): + ordered.append((key, self._counter[key])) + return ordered + + def most_common(self, n=None): + return self._counter.most_common(n) + + @classmethod + def as_string(self, occurences, label_length): + output = "\n" + for key, count in occurences: + key_str = str(key).ljust(label_length)[:label_length] + output += "%s %s\n" % (key_str, "*" * count) + return output + + def __str__(self): + return Histogram.as_string(self.in_order(), label_length=self.label_length) diff --git a/saucebrush/tests/emitters.py b/saucebrush/tests/emitters.py index 1c8b9d8..3660f9f 100644 --- a/saucebrush/tests/emitters.py +++ b/saucebrush/tests/emitters.py @@ -22,13 +22,26 @@ class EmitterTestCase(unittest.TestCase): self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n') def test_count_emitter(self): - ce = CountEmitter(every=10, outfile=self.output, format="%s records\n") - data = ce.attach([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]) - for _ in data: - pass + + # values for test + values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22] + + # test without of parameter + ce = CountEmitter(every=10, outfile=self.output, format="%(count)s records\n") + list(ce.attach(values)) self.assertEquals(self.output.getvalue(), '10 records\n20 records\n') ce.done() self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n') + + # reset output + self.output.truncate(0) + + # test with of parameter + ce = CountEmitter(every=10, outfile=self.output, 
of=len(values)) + list(ce.attach(values)) + self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n') + ce.done() + self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n') if __name__ == '__main__': unittest.main() diff --git a/saucebrush/tests/filters.py b/saucebrush/tests/filters.py index cbd9cd7..d0f3b85 100644 --- a/saucebrush/tests/filters.py +++ b/saucebrush/tests/filters.py @@ -3,7 +3,7 @@ import operator import types from saucebrush.filters import (Filter, YieldFilter, FieldFilter, SubrecordFilter, ConditionalPathFilter, - ConditionalFilter, FieldModifier, + ConditionalFilter, FieldModifier, FieldKeeper, FieldRemover, FieldMerger, FieldAdder, FieldCopier, FieldRenamer, Unique) @@ -200,6 +200,13 @@ class FilterTestCase(unittest.TestCase): {'a':2, 'b':10, 'c':200}] self.assert_filter_result(fm, expected_data) + def test_field_keeper(self): + fk = FieldKeeper(['c']) + + # check against expected results + expected_data = [{'c':3}, {'c':5}, {'c':100}] + self.assert_filter_result(fk, expected_data) + def test_field_remover(self): fr = FieldRemover(['a', 'b']) diff --git a/saucebrush/tests/stats.py b/saucebrush/tests/stats.py index f529e07..37a2933 100644 --- a/saucebrush/tests/stats.py +++ b/saucebrush/tests/stats.py @@ -1,5 +1,5 @@ import unittest -from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation +from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram class StatsTestCase(unittest.TestCase): @@ -19,10 +19,16 @@ class StatsTestCase(unittest.TestCase): self.assertEqual(fltr.value(), 36.0) def test_median(self): + # odd number of values fltr = Median('a') list(fltr.attach(self._simple_data())) self.assertEqual(fltr.value(), 1) - + + # even number of values + fltr = Median('a') + list(fltr.attach(self._simple_data()[:2])) + self.assertEqual(fltr.value(), 3) + def test_minmax(self): fltr = MinMax('b') list(fltr.attach(self._simple_data())) @@ -35,6 +41,12 @@ class 
StatsTestCase(unittest.TestCase): self.assertEqual(fltr.median(), 5) self.assertEqual(fltr.value(), (55.4346462061408, 3073.0)) self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665)) + + def test_histogram(self): + fltr = Histogram('a') + fltr.label_length = 1 + list(fltr.attach(self._simple_data())) + self.assertEqual(str(fltr), "\n1 **\n5 *\n") if __name__ == '__main__': unittest.main() diff --git a/saucebrush/utils.py b/saucebrush/utils.py index da0399f..aa8c241 100644 --- a/saucebrush/utils.py +++ b/saucebrush/utils.py @@ -1,3 +1,4 @@ +import os import urllib2 """ General utilities used within saucebrush that may be useful elsewhere. @@ -93,7 +94,6 @@ class Files(object): return self.linereader() def linereader(self): - import os for path in iter(self.paths): if os.path.exists(path): if self.file_open_callback: