From 662b43b0edde2d5405e97a7d6002f23339f1b0db Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Fri, 3 Jun 2011 13:56:37 -0400 Subject: [PATCH 01/11] move import --- saucebrush/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saucebrush/utils.py b/saucebrush/utils.py index da0399f..aa8c241 100644 --- a/saucebrush/utils.py +++ b/saucebrush/utils.py @@ -1,3 +1,4 @@ +import os import urllib2 """ General utilities used within saucebrush that may be useful elsewhere. @@ -93,7 +94,6 @@ class Files(object): return self.linereader() def linereader(self): - import os for path in iter(self.paths): if os.path.exists(path): if self.file_open_callback: From 6bb753d2b1d9290009d467791c974634f280de82 Mon Sep 17 00:00:00 2001 From: James Turk Date: Thu, 4 Aug 2011 11:39:28 -0400 Subject: [PATCH 02/11] LICENSE --- LICENSE | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..1ee0c7c --- /dev/null +++ b/LICENSE @@ -0,0 +1,27 @@ +Copyright (c) 2011, Sunlight Labs + +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Sunlight Labs nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From 7e7893109a6e71524e0f67cc53ee17f0172aaf94 Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Wed, 14 Sep 2011 13:22:52 -0400 Subject: [PATCH 03/11] median will now return a float with both even and odd number of values --- saucebrush/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saucebrush/stats.py b/saucebrush/stats.py index b726035..cc5ba83 100644 --- a/saucebrush/stats.py +++ b/saucebrush/stats.py @@ -27,7 +27,7 @@ def _median(values): if count % 2 == 1: # odd number of items, return middle value - return values[count / 2] + return float(values[count / 2]) else: # even number of items, return average of middle two items mid = count / 2 From 42213ff106ba8661879876a0b395b06d437cd816 Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Wed, 14 Sep 2011 13:23:22 -0400 Subject: [PATCH 04/11] add test for median with an even number of values --- saucebrush/tests/stats.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/saucebrush/tests/stats.py b/saucebrush/tests/stats.py index f529e07..cd797a1 100644 --- a/saucebrush/tests/stats.py +++ b/saucebrush/tests/stats.py @@ -19,10 +19,16 @@ class StatsTestCase(unittest.TestCase): self.assertEqual(fltr.value(), 
36.0) def test_median(self): + # odd number of values fltr = Median('a') list(fltr.attach(self._simple_data())) self.assertEqual(fltr.value(), 1) - + + # even number of values + fltr = Median('a') + list(fltr.attach(self._simple_data()[:2])) + self.assertEqual(fltr.value(), 3) + def test_minmax(self): fltr = MinMax('b') list(fltr.attach(self._simple_data())) From ddd81c96b116676dd68a007567137f0d5af2f864 Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Wed, 14 Sep 2011 13:49:23 -0400 Subject: [PATCH 05/11] update CountEmitter to take an optional of argument to display 'x of y' --- saucebrush/emitters.py | 21 +++++++++++++++++---- saucebrush/tests/emitters.py | 21 +++++++++++++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/saucebrush/emitters.py b/saucebrush/emitters.py index a66ac34..1815345 100644 --- a/saucebrush/emitters.py +++ b/saucebrush/emitters.py @@ -61,24 +61,37 @@ class CountEmitter(Emitter): CountEmitter(every=1000000) would write the count every 1,000,000 records. 
""" - def __init__(self, every=1000, outfile=None, format=None): + def __init__(self, every=1000, of=None, outfile=None, format=None): + super(CountEmitter, self).__init__() + if not outfile: import sys self._outfile = sys.stdout else: self._outfile = outfile - self._format = "%s\n" if format is None else format + + if format is None: + if of is not None: + format = "%(count)s of %(of)s\n" + else: + format = "%(count)s\n" + + self._format = format self._every = every + self._of = of self.count = 0 + + def __str__(self): + return self._format % {'count': self.count, 'of': self._of} def emit_record(self, record): self.count += 1 if self.count % self._every == 0: - self._outfile.write(self._format % self.count) + self._outfile.write(str(self)) def done(self): - self._outfile.write(self._format % self.count) + self._outfile.write(str(self)) class CSVEmitter(Emitter): diff --git a/saucebrush/tests/emitters.py b/saucebrush/tests/emitters.py index 1c8b9d8..3660f9f 100644 --- a/saucebrush/tests/emitters.py +++ b/saucebrush/tests/emitters.py @@ -22,13 +22,26 @@ class EmitterTestCase(unittest.TestCase): self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n') def test_count_emitter(self): - ce = CountEmitter(every=10, outfile=self.output, format="%s records\n") - data = ce.attach([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]) - for _ in data: - pass + + # values for test + values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22] + + # test without of parameter + ce = CountEmitter(every=10, outfile=self.output, format="%(count)s records\n") + list(ce.attach(values)) self.assertEquals(self.output.getvalue(), '10 records\n20 records\n') ce.done() self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n') + + # reset output + self.output.truncate(0) + + # test with of parameter + ce = CountEmitter(every=10, outfile=self.output, of=len(values)) + list(ce.attach(values)) + self.assertEquals(self.output.getvalue(), '10 
of 22\n20 of 22\n') + ce.done() + self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n') if __name__ == '__main__': unittest.main() From ecbd41808adf6876d98e5dc7ca96b7159130317e Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Wed, 14 Sep 2011 14:21:04 -0400 Subject: [PATCH 06/11] add doc for CountEmitter 'of' parameter --- saucebrush/emitters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/saucebrush/emitters.py b/saucebrush/emitters.py index 1815345..8460984 100644 --- a/saucebrush/emitters.py +++ b/saucebrush/emitters.py @@ -59,6 +59,7 @@ class CountEmitter(Emitter): CountEmitter() by default writes to stdout. CountEmitter(outfile=open('text', 'w')) would print to a file name test. CountEmitter(every=1000000) would write the count every 1,000,000 records. + CountEmitter(every=100, of=2000) would write " of 2000" every 100 records. """ def __init__(self, every=1000, of=None, outfile=None, format=None): From 1587624d2295aca5752734701c92231eae192a26 Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Thu, 15 Sep 2011 11:09:04 -0400 Subject: [PATCH 07/11] add FieldKeeper filter --- saucebrush/filters.py | 18 ++++++++++++++++++ saucebrush/tests/filters.py | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/saucebrush/filters.py b/saucebrush/filters.py index 7fee37a..380628f 100644 --- a/saucebrush/filters.py +++ b/saucebrush/filters.py @@ -215,6 +215,24 @@ class FieldModifier(FieldFilter): str(self._target_keys), str(self._filter_func)) +class FieldKeeper(Filter): + """ Filter that removes all but the given set of fields. + + FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs + fields from every record filtered. 
+ """ + + def __init__(self, keys): + super(FieldKeeper, self).__init__() + self._target_keys = utils.str_or_list(keys) + + def process_record(self, record): + for key in record.keys(): + if key not in self._target_keys: + del record[key] + return record + + class FieldRemover(Filter): """ Filter that removes a given set of fields. diff --git a/saucebrush/tests/filters.py b/saucebrush/tests/filters.py index cbd9cd7..d0f3b85 100644 --- a/saucebrush/tests/filters.py +++ b/saucebrush/tests/filters.py @@ -3,7 +3,7 @@ import operator import types from saucebrush.filters import (Filter, YieldFilter, FieldFilter, SubrecordFilter, ConditionalPathFilter, - ConditionalFilter, FieldModifier, + ConditionalFilter, FieldModifier, FieldKeeper, FieldRemover, FieldMerger, FieldAdder, FieldCopier, FieldRenamer, Unique) @@ -200,6 +200,13 @@ class FilterTestCase(unittest.TestCase): {'a':2, 'b':10, 'c':200}] self.assert_filter_result(fm, expected_data) + def test_field_keeper(self): + fk = FieldKeeper(['c']) + + # check against expected results + expected_data = [{'c':3}, {'c':5}, {'c':100}] + self.assert_filter_result(fk, expected_data) + def test_field_remover(self): fr = FieldRemover(['a', 'b']) From cdcee896581cf82441dd8557cbfdbe1f484f6934 Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Mon, 19 Sep 2011 14:01:36 -0400 Subject: [PATCH 08/11] add basic histogram stats filter --- saucebrush/stats.py | 30 ++++++++++++++++++++++++++++-- saucebrush/tests/stats.py | 8 +++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/saucebrush/stats.py b/saucebrush/stats.py index cc5ba83..796d3fe 100644 --- a/saucebrush/stats.py +++ b/saucebrush/stats.py @@ -7,7 +7,9 @@ def _average(values): :param values: an iterable of ints or floats to average """ - return sum(values) / float(len(values)) + value_count = len(values) + if len(values) > 0: + return sum(values) / float(value_count) def _median(values): """ Calculate the median of a list of values. 
@@ -174,4 +176,28 @@ class StandardDeviation(StatsFilter): :param population: True if values represents entire population, False if values is a sample. Default: False """ - return _stddev(self._values, population) \ No newline at end of file + return _stddev(self._values, population) + +class Histogram(StatsFilter): + + label_length = 6 + + def __init__(self, field, **kwargs): + super(Histogram, self).__init__(field, **kwargs) + self._data = {} + + def process_field(self, item): + if item not in self._data: + self._data[item] = 0 + self._data[item] += 1 + + def value(self): + return self._data.copy() + + def __str__(self): + output = "" + for key in sorted(self._data.keys()): + key_str = str(key).ljust(self.label_length)[:self.label_length] + output += "%s %s\n" % (key_str, "*" * self._data[key]) + return output + \ No newline at end of file diff --git a/saucebrush/tests/stats.py b/saucebrush/tests/stats.py index cd797a1..5040ddf 100644 --- a/saucebrush/tests/stats.py +++ b/saucebrush/tests/stats.py @@ -1,5 +1,5 @@ import unittest -from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation +from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram class StatsTestCase(unittest.TestCase): @@ -41,6 +41,12 @@ class StatsTestCase(unittest.TestCase): self.assertEqual(fltr.median(), 5) self.assertEqual(fltr.value(), (55.4346462061408, 3073.0)) self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665)) + + def test_histogram(self): + fltr = Histogram('a') + fltr.label_length = 1 + list(fltr.attach(self._simple_data())) + self.assertEqual(str(fltr), "1 **\n5 *\n") if __name__ == '__main__': unittest.main() From cc42012dafe24aa1aa0e0b4f741573a3319e049c Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Mon, 19 Sep 2011 14:07:03 -0400 Subject: [PATCH 09/11] add class docs to Histogram --- saucebrush/stats.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/saucebrush/stats.py b/saucebrush/stats.py 
index 796d3fe..94e46e2 100644 --- a/saucebrush/stats.py +++ b/saucebrush/stats.py @@ -179,6 +179,14 @@ class StandardDeviation(StatsFilter): return _stddev(self._values, population) class Histogram(StatsFilter): + """ Generate a basic histogram of the specified field. The value() method + returns a dict of value to occurrence count mappings. The __str__ method + generates a basic and limited histogram useful for printing to the + command line. The label_length attribute determines the padding and + cut-off of the basic histogram labels. + + **This filter maintains a dict of unique field values in memory.** + """ label_length = 6 From 638183b5626cdd159b52c9446db88d23dd782a7e Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Mon, 19 Sep 2011 14:41:54 -0400 Subject: [PATCH 10/11] add prep_field method to histogram to allow for custom labeling and grouping of values --- saucebrush/stats.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/saucebrush/stats.py b/saucebrush/stats.py index 94e46e2..4099b71 100644 --- a/saucebrush/stats.py +++ b/saucebrush/stats.py @@ -195,10 +195,14 @@ class Histogram(StatsFilter): self._data = {} def process_field(self, item): + item = self.prep_field(item) if item not in self._data: self._data[item] = 0 self._data[item] += 1 + def prep_field(self, item): + return item + def value(self): return self._data.copy() @@ -208,4 +212,3 @@ class Histogram(StatsFilter): key_str = str(key).ljust(self.label_length)[:self.label_length] output += "%s %s\n" % (key_str, "*" * self._data[key]) return output - \ No newline at end of file From ac096b862fc1bd2397e7eb1df9cd4dc79a680efc Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Mon, 19 Sep 2011 15:23:02 -0400 Subject: [PATCH 11/11] add some unnecessary complexity to Histogram for sorting values alphabetically and by most common --- saucebrush/stats.py | 33 ++++++++++++++++++++++----------- saucebrush/tests/stats.py | 2 +- 2 files changed, 23 insertions(+), 12 deletions(-) diff 
--git a/saucebrush/stats.py b/saucebrush/stats.py index 4099b71..0eed9fd 100644 --- a/saucebrush/stats.py +++ b/saucebrush/stats.py @@ -1,4 +1,5 @@ from saucebrush.filters import Filter +import collections import itertools import math @@ -192,23 +193,33 @@ class Histogram(StatsFilter): def __init__(self, field, **kwargs): super(Histogram, self).__init__(field, **kwargs) - self._data = {} + self._counter = collections.Counter() def process_field(self, item): - item = self.prep_field(item) - if item not in self._data: - self._data[item] = 0 - self._data[item] += 1 + self._counter[self.prep_field(item)] += 1 def prep_field(self, item): return item def value(self): - return self._data.copy() + return self._counter.copy() + + def in_order(self): + ordered = [] + for key in sorted(self._counter.keys()): + ordered.append((key, self._counter[key])) + return ordered + + def most_common(self, n=None): + return self._counter.most_common(n) + + @classmethod + def as_string(self, occurences, label_length): + output = "\n" + for key, count in occurences: + key_str = str(key).ljust(label_length)[:label_length] + output += "%s %s\n" % (key_str, "*" * count) + return output def __str__(self): - output = "" - for key in sorted(self._data.keys()): - key_str = str(key).ljust(self.label_length)[:self.label_length] - output += "%s %s\n" % (key_str, "*" * self._data[key]) - return output + return Histogram.as_string(self.in_order(), label_length=self.label_length) diff --git a/saucebrush/tests/stats.py b/saucebrush/tests/stats.py index 5040ddf..37a2933 100644 --- a/saucebrush/tests/stats.py +++ b/saucebrush/tests/stats.py @@ -46,7 +46,7 @@ class StatsTestCase(unittest.TestCase): fltr = Histogram('a') fltr.label_length = 1 list(fltr.attach(self._simple_data())) - self.assertEqual(str(fltr), "1 **\n5 *\n") + self.assertEqual(str(fltr), "\n1 **\n5 *\n") if __name__ == '__main__': unittest.main()