Merge branch 'master' of github.com:sunlightlabs/saucebrush
This commit is contained in:
commit
b3b491d927
177
saucebrush/stats.py
Normal file
177
saucebrush/stats.py
Normal file
@ -0,0 +1,177 @@
|
|||||||
|
from saucebrush.filters import Filter
|
||||||
|
import itertools
|
||||||
|
import math
|
||||||
|
|
||||||
|
def _average(values):
    """ Calculate the average (arithmetic mean) of a collection of values.

        :param values: an iterable of ints or floats to average; one-shot
                       iterables such as generators are accepted
        :returns: the mean of the values as a float
        :raises ZeroDivisionError: if values is empty
    """
    # len() needs a sized container, so iterators/generators are materialized
    # once up front; lists and tuples are used as-is without copying
    if not hasattr(values, '__len__'):
        values = list(values)

    return sum(values) / float(len(values))
|
||||||
|
|
||||||
|
def _median(values):
    """ Calculate the median of a collection of values.

        :param values: an iterable of ints or floats; one-shot iterables
                       such as generators are accepted
        :returns: None if values is empty, the middle value if the number of
                  values is odd, otherwise the mean (as a float) of the two
                  middle values
    """
    # sorted() always builds a new list, so the caller's data is never
    # reordered and iterables without len() are handled as well
    ordered = sorted(values)
    count = len(ordered)

    if count == 0:
        return None

    # floor division keeps the index an int even under true division
    # (Python 3 or ``from __future__ import division``)
    mid = count // 2

    if count % 2 == 1:
        # odd number of items, return middle value
        return ordered[mid]

    # even number of items, return average of middle two items
    return sum(ordered[mid - 1:mid + 1]) / 2.0
|
||||||
|
|
||||||
|
def _stddev(values, population=False):
    """ Calculate the standard deviation and variance of a collection of
        values.

        :param values: an iterable of ints or floats to calculate; one-shot
                       iterables such as generators are accepted
        :param population: True if values represents entire population,
                           False if it is a sample of the population
        :returns: a tuple of (standard_deviation, variance)
        :raises ZeroDivisionError: if values is empty, or if it holds a
                                   single value and population is False
    """
    # values is traversed more than once (mean, length, squared differences),
    # so a one-shot iterable has to be materialized before the first pass
    if not hasattr(values, '__len__'):
        values = list(values)

    avg = _average(values)

    # a sample divides by n - 1, an entire population by n
    count = len(values) if population else len(values) - 1

    # square the difference between each value and the average
    diffsq = ((i - avg) ** 2 for i in values)

    # the average of the squared differences
    variance = sum(diffsq) / float(count)

    return (math.sqrt(variance), variance)  # stddev is sqrt of variance
|
||||||
|
|
||||||
|
class StatsFilter(Filter):
    """ Common base class for the statistics filters.

        Records pass through unchanged; the value of one field of each
        record is handed to process_field() so a subclass can accumulate it.

        :param field: key of the record field to collect statistics on
        :param test: optional callable taking a record; when given, only
                     records for which it returns a true value are processed
    """

    def __init__(self, field, test=None):
        self._test = test
        self._field = field

    def process_record(self, record):
        """ Pass the watched field to process_field() and return the record
            untouched.
        """
        # no test configured means every record is of interest
        wanted = True if self._test is None else self._test(record)
        if wanted:
            self.process_field(record[self._field])
        return record

    def process_field(self, record):
        """ Accumulate a single field value. Subclasses must override this.

            Despite its name, the argument is the value of the watched field
            (see process_record), not the whole record.
        """
        owner = self.__class__.__name__
        raise NotImplementedError('process_field not defined in ' + owner)

    def value(self):
        """ Return the computed statistic. Subclasses must override this. """
        owner = self.__class__.__name__
        raise NotImplementedError('value not defined in ' + owner)
|
||||||
|
|
||||||
|
class Sum(StatsFilter):
    """ Keep a running total of the values in a field. Field must contain
        either int or float values.

        :param field: key of the record field to add up
        :param initial: value the total starts from (default 0)
        :param kwargs: passed on to StatsFilter (e.g. test)
    """

    def __init__(self, field, initial=0, **kwargs):
        super(Sum, self).__init__(field, **kwargs)
        self._value = initial

    def process_field(self, item):
        # None (or any other false value) contributes nothing to the total
        addend = item if item else 0
        self._value += addend

    def value(self):
        """ Return the total accumulated so far. """
        return self._value
|
||||||
|
|
||||||
|
class Average(StatsFilter):
    """ Calculate the average (mean) of the values in a field. Field must
        contain either int or float values. None values are skipped and do
        not count towards the average.

        :param field: key of the record field to average
        :param initial: starting value of the running total (default 0); it
                        is added to the total without being counted as a
                        value
        :param kwargs: passed on to StatsFilter (e.g. test)
    """

    def __init__(self, field, initial=0, **kwargs):
        super(Average, self).__init__(field, **kwargs)
        self._value = initial
        self._count = 0

    def process_field(self, item):
        if item is not None:
            self._value += item
            self._count += 1

    def value(self):
        """ Return the mean as a float, or None if no values were seen. """
        # without this guard an empty or all-None field would raise
        # ZeroDivisionError
        if self._count == 0:
            return None
        return self._value / float(self._count)
|
||||||
|
|
||||||
|
class Median(StatsFilter):
    """ Compute the median of the values in a field. Field must contain
        either int or float values. None values are ignored.

        **This filter keeps a list of field values in memory.**

        :param field: key of the record field to take the median of
        :param kwargs: passed on to StatsFilter (e.g. test)
    """

    def __init__(self, field, **kwargs):
        super(Median, self).__init__(field, **kwargs)
        self._values = []

    def process_field(self, item):
        # skip missing values, remember everything else
        if item is None:
            return
        self._values.append(item)

    def value(self):
        """ Return the median of the collected values (see _median). """
        collected = self._values
        return _median(collected)
|
||||||
|
|
||||||
|
class MinMax(StatsFilter):
    """ Track the smallest and largest values seen in a field. Field must
        contain either int or float values. None values are ignored.

        :param field: key of the record field to inspect
        :param kwargs: passed on to StatsFilter (e.g. test)
    """

    def __init__(self, field, **kwargs):
        super(MinMax, self).__init__(field, **kwargs)
        self._min = None
        self._max = None

    def process_field(self, item):
        if item is None:
            return

        # None marks "nothing seen yet", so the first real value becomes
        # both the minimum and the maximum
        if self._min is None or item < self._min:
            self._min = item
        if self._max is None or item > self._max:
            self._max = item

    def value(self):
        """ Return a (minimum, maximum) tuple; both entries are None if no
            value has been seen.
        """
        lowest, highest = self._min, self._max
        return (lowest, highest)
|
||||||
|
|
||||||
|
class StandardDeviation(StatsFilter):
    """ Compute the standard deviation of the values in a field. By default
        value() treats the collected values as a sample; call it with
        population=True to treat them as the entire population. average()
        and median() are offered as conveniences over the same values.
        Field must contain either int or float values. None values are
        ignored.

        **This filter keeps a list of field values in memory.**

        :param field: key of the record field to analyse
        :param kwargs: passed on to StatsFilter (e.g. test)
    """

    def __init__(self, field, **kwargs):
        super(StandardDeviation, self).__init__(field, **kwargs)
        self._values = []

    def process_field(self, item):
        # skip missing values, remember everything else
        if item is None:
            return
        self._values.append(item)

    def average(self):
        """ Return the mean of the collected values (see _average). """
        collected = self._values
        return _average(collected)

    def median(self):
        """ Return the median of the collected values (see _median). """
        collected = self._values
        return _median(collected)

    def value(self, population=False):
        """ Return a tuple of (standard_deviation, variance).

            :param population: True if values represents entire population,
                               False if values is a sample. Default: False
        """
        collected = self._values
        return _stddev(collected, population)
|
@ -3,11 +3,13 @@ from saucebrush.tests.filters import FilterTestCase
|
|||||||
from saucebrush.tests.sources import SourceTestCase
|
from saucebrush.tests.sources import SourceTestCase
|
||||||
from saucebrush.tests.emitters import EmitterTestCase
|
from saucebrush.tests.emitters import EmitterTestCase
|
||||||
from saucebrush.tests.recipes import RecipeTestCase
|
from saucebrush.tests.recipes import RecipeTestCase
|
||||||
|
from saucebrush.tests.stats import StatsTestCase
|
||||||
|
|
||||||
filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase)
|
filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase)
|
||||||
source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase)
|
source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase)
|
||||||
emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase)
|
emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase)
|
||||||
recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase)
|
recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase)
|
||||||
|
stats_suite = unittest.TestLoader().loadTestsFromTestCase(StatsTestCase)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
40
saucebrush/tests/stats.py
Normal file
40
saucebrush/tests/stats.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
import unittest
|
||||||
|
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation
|
||||||
|
|
||||||
|
class StatsTestCase(unittest.TestCase):
    """ Tests for the filters in saucebrush.stats.

        Each test attaches a filter to the sample records and wraps the
        result in list(); attach() is defined on the Filter base class
        outside this module and presumably returns a lazy iterator that must
        be consumed for the records to be processed.
    """

    def _simple_data(self):
        """ Return three small records with integer fields a, b and c. """
        return [{'a':1, 'b':2, 'c':3},
                {'a':5, 'b':5, 'c':5},
                {'a':1, 'b':10, 'c':100}]

    def test_sum(self):
        fltr = Sum('b')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), 17)

    def test_average(self):
        fltr = Average('c')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), 36.0)

    def test_median(self):
        fltr = Median('a')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), 1)

    def test_minmax(self):
        fltr = MinMax('b')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.value(), (2, 10))

    def test_standard_deviation(self):
        fltr = StandardDeviation('c')
        list(fltr.attach(self._simple_data()))
        self.assertEqual(fltr.average(), 36.0)
        self.assertEqual(fltr.median(), 5)

        # square roots and divisions are subject to floating point rounding,
        # so the results are compared approximately, component by component,
        # rather than with exact tuple equality

        # sample (default)
        stddev, variance = fltr.value()
        self.assertAlmostEqual(stddev, 55.4346462061408)
        self.assertAlmostEqual(variance, 3073.0)

        # entire population
        stddev, variance = fltr.value(True)
        self.assertAlmostEqual(stddev, 45.2621990922521)
        self.assertAlmostEqual(variance, 2048.6666666666665)
|
||||||
|
|
||||||
|
# Allow this test module to be run directly as a script.
if __name__ == '__main__':
    unittest.main()
|
@ -1,3 +1,5 @@
|
|||||||
|
import os
|
||||||
|
import urllib2
|
||||||
"""
|
"""
|
||||||
General utilities used within saucebrush that may be useful elsewhere.
|
General utilities used within saucebrush that may be useful elsewhere.
|
||||||
"""
|
"""
|
||||||
@ -72,6 +74,12 @@ def str_or_list(obj):
|
|||||||
#
|
#
|
||||||
|
|
||||||
class Files(object):
|
class Files(object):
|
||||||
|
""" Iterate over multiple files as a single file. Pass the paths of the
|
||||||
|
files as arguments to the class constructor:
|
||||||
|
|
||||||
|
for line in Files('/path/to/file/a', '/path/to/file/b'):
|
||||||
|
pass
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, *args):
|
def __init__(self, *args):
|
||||||
self.paths = []
|
self.paths = []
|
||||||
@ -86,7 +94,6 @@ class Files(object):
|
|||||||
return self.linereader()
|
return self.linereader()
|
||||||
|
|
||||||
def linereader(self):
|
def linereader(self):
|
||||||
import os
|
|
||||||
for path in iter(self.paths):
|
for path in iter(self.paths):
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
if self.file_open_callback:
|
if self.file_open_callback:
|
||||||
@ -95,3 +102,18 @@ class Files(object):
|
|||||||
for line in f:
|
for line in f:
|
||||||
yield line
|
yield line
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
class RemoteFile(object):
    """ Stream data from a remote file.

        Each iteration over an instance opens the URL again and yields its
        content line by line with trailing whitespace removed:

            for line in RemoteFile('http://example.com/data.csv'):
                pass

        :param url: URL to remote file
    """

    def __init__(self, url):
        self._url = url

    def __iter__(self):
        resp = urllib2.urlopen(self._url)
        try:
            for line in resp:
                # rstrip() drops the line terminator along with any other
                # trailing whitespace
                yield line.rstrip()
        finally:
            # runs when the response is exhausted, when reading fails, and
            # when a partially consumed generator is closed, so the
            # response is closed in every case
            resp.close()
|
Loading…
Reference in New Issue
Block a user