Merge branch 'master' of github.com:sunlightlabs/saucebrush
This commit is contained in:
commit
196259e6c0
27
LICENSE
Normal file
27
LICENSE
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
Copyright (c) 2011, Sunlight Labs
|
||||||
|
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without modification,
|
||||||
|
are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer in the documentation
|
||||||
|
and/or other materials provided with the distribution.
|
||||||
|
* Neither the name of Sunlight Labs nor the names of its contributors may be
|
||||||
|
used to endorse or promote products derived from this software without
|
||||||
|
specific prior written permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
|
||||||
|
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
@ -59,26 +59,40 @@ class CountEmitter(Emitter):
|
|||||||
CountEmitter() by default writes to stdout.
|
CountEmitter() by default writes to stdout.
|
||||||
CountEmitter(outfile=open('text', 'w')) would print to a file name test.
|
CountEmitter(outfile=open('text', 'w')) would print to a file name test.
|
||||||
CountEmitter(every=1000000) would write the count every 1,000,000 records.
|
CountEmitter(every=1000000) would write the count every 1,000,000 records.
|
||||||
|
CountEmitter(every=100, of=2000) would write "<count> of 2000" every 100 records.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, every=1000, outfile=None, format=None):
|
def __init__(self, every=1000, of=None, outfile=None, format=None):
|
||||||
|
|
||||||
super(CountEmitter, self).__init__()
|
super(CountEmitter, self).__init__()
|
||||||
|
|
||||||
if not outfile:
|
if not outfile:
|
||||||
import sys
|
import sys
|
||||||
self._outfile = sys.stdout
|
self._outfile = sys.stdout
|
||||||
else:
|
else:
|
||||||
self._outfile = outfile
|
self._outfile = outfile
|
||||||
self._format = "%s\n" if format is None else format
|
|
||||||
|
if format is None:
|
||||||
|
if of is not None:
|
||||||
|
format = "%(count)s of %(of)s\n"
|
||||||
|
else:
|
||||||
|
format = "%(count)s\n"
|
||||||
|
|
||||||
|
self._format = format
|
||||||
self._every = every
|
self._every = every
|
||||||
|
self._of = of
|
||||||
self.count = 0
|
self.count = 0
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return self._format % {'count': self.count, 'of': self._of}
|
||||||
|
|
||||||
def emit_record(self, record):
|
def emit_record(self, record):
|
||||||
self.count += 1
|
self.count += 1
|
||||||
if self.count % self._every == 0:
|
if self.count % self._every == 0:
|
||||||
self._outfile.write(self._format % self.count)
|
self._outfile.write(str(self))
|
||||||
|
|
||||||
def done(self):
|
def done(self):
|
||||||
self._outfile.write(self._format % self.count)
|
self._outfile.write(str(self))
|
||||||
|
|
||||||
|
|
||||||
class CSVEmitter(Emitter):
|
class CSVEmitter(Emitter):
|
||||||
|
@ -215,6 +215,24 @@ class FieldModifier(FieldFilter):
|
|||||||
str(self._target_keys), str(self._filter_func))
|
str(self._target_keys), str(self._filter_func))
|
||||||
|
|
||||||
|
|
||||||
|
class FieldKeeper(Filter):
    """ Filter that removes all but the given set of fields.

        FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs
        fields from every record filtered.
    """

    def __init__(self, keys):
        """ Remember which fields should survive filtering.

            :param keys: the field name(s) to keep; normalized through
                         utils.str_or_list (presumably accepts a single
                         name or a sequence of names -- confirm against
                         utils)
        """
        super(FieldKeeper, self).__init__()
        self._target_keys = utils.str_or_list(keys)

    def process_record(self, record):
        """ Delete every field of record that is not a target key.

            The record is modified in place and the same object is returned.
        """
        # Iterate over a snapshot of the keys: deleting entries while
        # walking a live view of the dict's keys raises RuntimeError
        # ("dictionary changed size during iteration") on Python 3.
        for key in list(record.keys()):
            if key not in self._target_keys:
                del record[key]
        return record
|
||||||
|
|
||||||
|
|
||||||
class FieldRemover(Filter):
|
class FieldRemover(Filter):
|
||||||
""" Filter that removes a given set of fields.
|
""" Filter that removes a given set of fields.
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
from saucebrush.filters import Filter
|
from saucebrush.filters import Filter
|
||||||
|
import collections
|
||||||
import itertools
|
import itertools
|
||||||
import math
|
import math
|
||||||
|
|
||||||
@ -7,7 +8,9 @@ def _average(values):
|
|||||||
|
|
||||||
:param values: an iterable of ints or floats to average
|
:param values: an iterable of ints or floats to average
|
||||||
"""
|
"""
|
||||||
return sum(values) / float(len(values))
|
value_count = len(values)
|
||||||
|
if len(values) > 0:
|
||||||
|
return sum(values) / float(value_count)
|
||||||
|
|
||||||
def _median(values):
|
def _median(values):
|
||||||
""" Calculate the median of a list of values.
|
""" Calculate the median of a list of values.
|
||||||
@ -27,7 +30,7 @@ def _median(values):
|
|||||||
|
|
||||||
if count % 2 == 1:
|
if count % 2 == 1:
|
||||||
# odd number of items, return middle value
|
# odd number of items, return middle value
|
||||||
return values[count / 2]
|
return float(values[count / 2])
|
||||||
else:
|
else:
|
||||||
# even number of items, return average of middle two items
|
# even number of items, return average of middle two items
|
||||||
mid = count / 2
|
mid = count / 2
|
||||||
@ -175,3 +178,48 @@ class StandardDeviation(StatsFilter):
|
|||||||
False if values is a sample. Default: False
|
False if values is a sample. Default: False
|
||||||
"""
|
"""
|
||||||
return _stddev(self._values, population)
|
return _stddev(self._values, population)
|
||||||
|
|
||||||
|
class Histogram(StatsFilter):
    """ Generate a basic histogram of the specified field. The value() method
        returns a dict of value to occurrence count mappings. The __str__
        method generates a basic and limited histogram useful for printing to
        the command line. The label_length attribute determines the padding
        and cut-off of the basic histogram labels.

        **This filter maintains a dict of unique field values in memory.**
    """

    # width that labels are padded/truncated to when rendered by __str__
    label_length = 6

    def __init__(self, field, **kwargs):
        """ Set up an empty counter for the values seen in field.

            :param field: name of the field to build the histogram from;
                          passed through to StatsFilter
            :param kwargs: extra keyword arguments forwarded to StatsFilter
        """
        super(Histogram, self).__init__(field, **kwargs)
        self._counter = collections.Counter()

    def process_field(self, item):
        """ Count one occurrence of item (after prep_field is applied). """
        self._counter[self.prep_field(item)] += 1

    def prep_field(self, item):
        """ Hook for mapping a field value to the key it is counted under.

            The default implementation returns item unchanged.
        """
        return item

    def value(self):
        """ Return a copy of the value -> occurrence count mapping. """
        return self._counter.copy()

    def in_order(self):
        """ Return a list of (value, count) tuples sorted by value. """
        return [(key, self._counter[key]) for key in sorted(self._counter)]

    def most_common(self, n=None):
        """ Return the n most common (value, count) tuples, most frequent
            first; all of them when n is None.
        """
        return self._counter.most_common(n)

    @classmethod
    def as_string(cls, occurences, label_length):
        """ Render (value, count) pairs as a text histogram.

            :param occurences: an iterable of (value, count) tuples
            :param label_length: labels are left-justified and truncated
                                 to exactly this many characters

            The result starts with a newline, followed by one line per
            pair made of the label, a space and count asterisks.
        """
        lines = ["\n"]
        for key, count in occurences:
            key_str = str(key).ljust(label_length)[:label_length]
            lines.append("%s %s\n" % (key_str, "*" * count))
        # join once instead of repeatedly concatenating strings
        return "".join(lines)

    def __str__(self):
        # go through self so that subclasses overriding as_string are honored
        return self.as_string(self.in_order(), label_length=self.label_length)
|
||||||
|
@ -22,13 +22,26 @@ class EmitterTestCase(unittest.TestCase):
|
|||||||
self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n')
|
self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n')
|
||||||
|
|
||||||
def test_count_emitter(self):
|
def test_count_emitter(self):
|
||||||
ce = CountEmitter(every=10, outfile=self.output, format="%s records\n")
|
|
||||||
data = ce.attach([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22])
|
# values for test
|
||||||
for _ in data:
|
values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
|
||||||
pass
|
|
||||||
|
# test without of parameter
|
||||||
|
ce = CountEmitter(every=10, outfile=self.output, format="%(count)s records\n")
|
||||||
|
list(ce.attach(values))
|
||||||
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n')
|
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n')
|
||||||
ce.done()
|
ce.done()
|
||||||
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n')
|
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n')
|
||||||
|
|
||||||
|
# reset output
|
||||||
|
self.output.truncate(0)
|
||||||
|
|
||||||
|
# test with of parameter
|
||||||
|
ce = CountEmitter(every=10, outfile=self.output, of=len(values))
|
||||||
|
list(ce.attach(values))
|
||||||
|
self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n')
|
||||||
|
ce.done()
|
||||||
|
self.assertEquals(self.output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n')
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -3,7 +3,7 @@ import operator
|
|||||||
import types
|
import types
|
||||||
from saucebrush.filters import (Filter, YieldFilter, FieldFilter,
|
from saucebrush.filters import (Filter, YieldFilter, FieldFilter,
|
||||||
SubrecordFilter, ConditionalPathFilter,
|
SubrecordFilter, ConditionalPathFilter,
|
||||||
ConditionalFilter, FieldModifier,
|
ConditionalFilter, FieldModifier, FieldKeeper,
|
||||||
FieldRemover, FieldMerger, FieldAdder,
|
FieldRemover, FieldMerger, FieldAdder,
|
||||||
FieldCopier, FieldRenamer, Unique)
|
FieldCopier, FieldRenamer, Unique)
|
||||||
|
|
||||||
@ -200,6 +200,13 @@ class FilterTestCase(unittest.TestCase):
|
|||||||
{'a':2, 'b':10, 'c':200}]
|
{'a':2, 'b':10, 'c':200}]
|
||||||
self.assert_filter_result(fm, expected_data)
|
self.assert_filter_result(fm, expected_data)
|
||||||
|
|
||||||
|
def test_field_keeper(self):
|
||||||
|
fk = FieldKeeper(['c'])
|
||||||
|
|
||||||
|
# check against expected results
|
||||||
|
expected_data = [{'c':3}, {'c':5}, {'c':100}]
|
||||||
|
self.assert_filter_result(fk, expected_data)
|
||||||
|
|
||||||
def test_field_remover(self):
|
def test_field_remover(self):
|
||||||
fr = FieldRemover(['a', 'b'])
|
fr = FieldRemover(['a', 'b'])
|
||||||
|
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import unittest
|
import unittest
|
||||||
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation
|
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram
|
||||||
|
|
||||||
class StatsTestCase(unittest.TestCase):
|
class StatsTestCase(unittest.TestCase):
|
||||||
|
|
||||||
@ -19,10 +19,16 @@ class StatsTestCase(unittest.TestCase):
|
|||||||
self.assertEqual(fltr.value(), 36.0)
|
self.assertEqual(fltr.value(), 36.0)
|
||||||
|
|
||||||
def test_median(self):
|
def test_median(self):
|
||||||
|
# odd number of values
|
||||||
fltr = Median('a')
|
fltr = Median('a')
|
||||||
list(fltr.attach(self._simple_data()))
|
list(fltr.attach(self._simple_data()))
|
||||||
self.assertEqual(fltr.value(), 1)
|
self.assertEqual(fltr.value(), 1)
|
||||||
|
|
||||||
|
# even number of values
|
||||||
|
fltr = Median('a')
|
||||||
|
list(fltr.attach(self._simple_data()[:2]))
|
||||||
|
self.assertEqual(fltr.value(), 3)
|
||||||
|
|
||||||
def test_minmax(self):
|
def test_minmax(self):
|
||||||
fltr = MinMax('b')
|
fltr = MinMax('b')
|
||||||
list(fltr.attach(self._simple_data()))
|
list(fltr.attach(self._simple_data()))
|
||||||
@ -36,5 +42,11 @@ class StatsTestCase(unittest.TestCase):
|
|||||||
self.assertEqual(fltr.value(), (55.4346462061408, 3073.0))
|
self.assertEqual(fltr.value(), (55.4346462061408, 3073.0))
|
||||||
self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665))
|
self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665))
|
||||||
|
|
||||||
|
def test_histogram(self):
|
||||||
|
fltr = Histogram('a')
|
||||||
|
fltr.label_length = 1
|
||||||
|
list(fltr.attach(self._simple_data()))
|
||||||
|
self.assertEqual(str(fltr), "\n1 **\n5 *\n")
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import os
|
||||||
import urllib2
|
import urllib2
|
||||||
"""
|
"""
|
||||||
General utilities used within saucebrush that may be useful elsewhere.
|
General utilities used within saucebrush that may be useful elsewhere.
|
||||||
@ -93,7 +94,6 @@ class Files(object):
|
|||||||
return self.linereader()
|
return self.linereader()
|
||||||
|
|
||||||
def linereader(self):
|
def linereader(self):
|
||||||
import os
|
|
||||||
for path in iter(self.paths):
|
for path in iter(self.paths):
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
if self.file_open_callback:
|
if self.file_open_callback:
|
||||||
|
Loading…
Reference in New Issue
Block a user