Merge branch 'master' of github.com:sunlightlabs/saucebrush

This commit is contained in:
James Turk 2011-08-04 11:39:32 -04:00
commit b3b491d927
4 changed files with 242 additions and 1 deletions

177
saucebrush/stats.py Normal file
View File

@ -0,0 +1,177 @@
from saucebrush.filters import Filter
import itertools
import math
def _average(values):
    """ Calculate the average (arithmetic mean) of a collection of values.

        :param values: an iterable of ints or floats to average; one-shot
                       iterables such as generators are accepted
        :returns: the mean as a float
        :raises ZeroDivisionError: if values is empty
    """
    # sum() and len() each need their own look at the data, so materialize
    # anything that cannot report a length (generators, iterators)
    if not hasattr(values, '__len__'):
        values = list(values)
    return sum(values) / float(len(values))
def _median(values):
    """ Calculate the median of a collection of values.

        :param values: an iterable of ints or floats to calculate
        :returns: None when there are no values, the middle value when
                  there is an odd number of them, otherwise the average of
                  the two middle values as a float
    """
    # sorted() builds a new list, so the caller's data is left untouched
    # and one-shot iterables (generators) are supported as well
    ordered = sorted(values)
    count = len(ordered)
    if count == 0:
        return None

    # floor division keeps the index an int even under true division
    mid = count // 2
    if count % 2 == 1:
        # odd number of items, return middle value
        return ordered[mid]
    # even number of items, return average of middle two items
    return sum(ordered[mid - 1:mid + 1]) / 2.0
def _stddev(values, population=False):
    """ Compute the standard deviation of a list of values together with
        the variance it is derived from.

        :param values: the ints or floats to calculate; must support len()
                       and is iterated more than once
        :param population: True to treat values as the entire population
                           (divide by the number of values), False to
                           treat them as a sample (divide by one less)
        :returns: a (standard_deviation, variance) tuple
    """
    mean = _average(values)
    if population:
        divisor = len(values)
    else:
        divisor = len(values) - 1
    # total of the squared distances between each value and the mean
    squared_total = sum((v - mean) ** 2 for v in values)
    variance = squared_total / float(divisor)
    # the standard deviation is the square root of the variance
    return (math.sqrt(variance), variance)
class StatsFilter(Filter):
    """ Common base class for the statistics filters.

        Each record is returned unchanged; the value stored under
        ``field`` is handed to process_field() so a subclass can
        accumulate a statistic, which is read back through value().

        :param field: key of the record value to collect
        :param test: optional callable taking a record; when given, only
                     records for which it returns a true value are
                     collected
    """

    def __init__(self, field, test=None):
        self._test = test
        self._field = field

    def process_record(self, record):
        """ Pass the record's field to process_field() when the record
            qualifies, then return the record itself.
        """
        wanted = self._test is None or self._test(record)
        if wanted:
            self.process_field(record[self._field])
        return record

    def process_field(self, record):
        """ Consume a single field value; subclasses must override. """
        owner = self.__class__.__name__
        raise NotImplementedError('process_field not defined in ' + owner)

    def value(self):
        """ Return the accumulated statistic; subclasses must override. """
        owner = self.__class__.__name__
        raise NotImplementedError('value not defined in ' + owner)
class Sum(StatsFilter):
    """ Add up the values found in a field. The field is expected to hold
        int or float values.

        :param field: key of the record value to total
        :param initial: starting value of the running total (default 0)
    """

    def __init__(self, field, initial=0, **kwargs):
        super(Sum, self).__init__(field, **kwargs)
        self._value = initial

    def process_field(self, item):
        # falsy items (None, 0, ...) are counted as zero
        addend = item if item else 0
        self._value += addend

    def value(self):
        """ Return the running total. """
        return self._value
class Average(StatsFilter):
    """ Calculate the average (mean) of the values in a field. Field must
        contain either int or float values.

        :param field: key of the record value to average
        :param initial: starting value of the running total (default 0)
    """

    def __init__(self, field, initial=0, **kwargs):
        super(Average, self).__init__(field, **kwargs)
        self._value = initial
        self._count = 0

    def process_field(self, item):
        # None marks a missing value: it is neither added nor counted
        if item is not None:
            self._value += item
            self._count += 1

    def value(self):
        """ Return the mean of the collected values as a float, or None
            when no values have been collected.
        """
        # no values collected (empty source, all None, or the test never
        # matched); there is no mean to report, so avoid dividing by zero
        if self._count == 0:
            return None
        return self._value / float(self._count)
class Median(StatsFilter):
    """ Find the median of the values in a field. The field is expected
        to hold int or float values.

        **Every collected value is held in an in-memory list.**
    """

    def __init__(self, field, **kwargs):
        super(Median, self).__init__(field, **kwargs)
        self._values = []

    def process_field(self, item):
        # None marks a missing value and is not collected
        if item is None:
            return
        self._values.append(item)

    def value(self):
        """ Return the median of the values collected so far. """
        collected = self._values
        return _median(collected)
class MinMax(StatsFilter):
    """ Track the smallest and largest values seen in a field. The field
        is expected to hold int or float values.
    """

    def __init__(self, field, **kwargs):
        super(MinMax, self).__init__(field, **kwargs)
        # None means "nothing seen yet" for both bounds
        self._max = self._min = None

    def process_field(self, item):
        # None marks a missing value and never affects either bound
        if item is None:
            return
        new_high = self._max is None or item > self._max
        if new_high:
            self._max = item
        new_low = self._min is None or item < self._min
        if new_low:
            self._min = item

    def value(self):
        """ Return a (minimum, maximum) tuple; both entries are None until
            a value has been seen.
        """
        lowest, highest = self._min, self._max
        return (lowest, highest)
class StandardDeviation(StatsFilter):
    """ Compute the standard deviation of the values in a field. value()
        gives the sample standard deviation by default; call it with
        population=True for the population standard deviation. average()
        and median() are offered as conveniences over the same values.
        The field is expected to hold int or float values.

        **Every collected value is held in an in-memory list.**
    """

    def __init__(self, field, **kwargs):
        super(StandardDeviation, self).__init__(field, **kwargs)
        self._values = []

    def process_field(self, item):
        # None marks a missing value and is not collected
        if item is None:
            return
        self._values.append(item)

    def average(self):
        """ Return the mean of the collected values. """
        collected = self._values
        return _average(collected)

    def median(self):
        """ Return the median of the collected values. """
        collected = self._values
        return _median(collected)

    def value(self, population=False):
        """ Return a tuple of (standard_deviation, variance).

            :param population: True if the collected values are the whole
                               population, False if they are a sample.
                               Default: False
        """
        collected = self._values
        return _stddev(collected, population)

View File

@ -3,11 +3,13 @@ from saucebrush.tests.filters import FilterTestCase
from saucebrush.tests.sources import SourceTestCase
from saucebrush.tests.emitters import EmitterTestCase
from saucebrush.tests.recipes import RecipeTestCase
from saucebrush.tests.stats import StatsTestCase
filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase)
source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase)
emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase)
recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase)
stats_suite = unittest.TestLoader().loadTestsFromTestCase(StatsTestCase)
if __name__ == '__main__':
    unittest.main()

40
saucebrush/tests/stats.py Normal file
View File

@ -0,0 +1,40 @@
import unittest
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation
class StatsTestCase(unittest.TestCase):
    """ Tests for the statistics filters in saucebrush.stats.

        Each test attaches a filter to the sample records and wraps the
        result in list(), presumably to drain a lazy iterator so every
        record is processed before value() is read -- confirm against
        Filter.attach.
    """

    def _simple_data(self):
        """ Return three small records with int fields 'a', 'b', 'c'. """
        return [{'a':1, 'b':2, 'c':3},
                {'a':5, 'b':5, 'c':5},
                {'a':1, 'b':10, 'c':100}]

    def test_sum(self):
        fltr = Sum('b')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), 17)

    def test_average(self):
        fltr = Average('c')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), 36.0)

    def test_median(self):
        fltr = Median('a')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), 1)

    def test_minmax(self):
        fltr = MinMax('b')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), (2, 10))

    def test_standard_deviation(self):
        fltr = StandardDeviation('c')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.average(), 36.0)
        self.assertEqual(fltr.median(), 5)

        # the results come out of a square root and a float division, so
        # compare with a tolerance instead of demanding bit-exact floats
        stddev, variance = fltr.value()
        self.assertAlmostEqual(stddev, 55.4346462061408)
        self.assertAlmostEqual(variance, 3073.0)

        stddev, variance = fltr.value(True)
        self.assertAlmostEqual(stddev, 45.2621990922521)
        self.assertAlmostEqual(variance, 2048.6666666666665)
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

View File

@ -1,3 +1,5 @@
import os
import urllib2
""" """
General utilities used within saucebrush that may be useful elsewhere. General utilities used within saucebrush that may be useful elsewhere.
""" """
@ -72,6 +74,12 @@ def str_or_list(obj):
# #
class Files(object): class Files(object):
""" Iterate over multiple files as a single file. Pass the paths of the
files as arguments to the class constructor:
for line in Files('/path/to/file/a', '/path/to/file/b'):
pass
"""
def __init__(self, *args): def __init__(self, *args):
self.paths = [] self.paths = []
@ -86,7 +94,6 @@ class Files(object):
return self.linereader() return self.linereader()
def linereader(self): def linereader(self):
import os
for path in iter(self.paths): for path in iter(self.paths):
if os.path.exists(path): if os.path.exists(path):
if self.file_open_callback: if self.file_open_callback:
@ -95,3 +102,18 @@ class Files(object):
for line in f: for line in f:
yield line yield line
f.close() f.close()
class RemoteFile(object):
    """ Stream data from a remote file.

        :param url: URL to remote file

        Iterating over an instance opens the URL and yields the response
        one line at a time with trailing whitespace (line ending included)
        stripped. Every new iteration opens the URL again.
    """

    def __init__(self, url):
        self._url = url

    def __iter__(self):
        resp = urllib2.urlopen(self._url)
        try:
            for line in resp:
                yield line.rstrip()
        finally:
            # also runs when reading fails or the consumer stops iterating
            # early, so the response is not left open
            resp.close()