diff --git a/saucebrush/stats.py b/saucebrush/stats.py
new file mode 100644
--- /dev/null
+++ b/saucebrush/stats.py
@@ -0,0 +1,205 @@
+from saucebrush.filters import Filter
+import itertools
+import math
+
+def _average(values):
+    """ Calculate the average of a list of values.
+
+        :param values: a sequence of ints or floats to average
+        :returns: the mean as a float, or None if values is empty
+    """
+    count = len(values)
+    if count == 0:
+        # nothing to average; return None like _median() does
+        return None
+    return sum(values) / float(count)
+
+def _median(values):
+    """ Calculate the median of a list of values.
+
+        :param values: a sequence of ints or floats to calculate
+        :returns: the middle value, the mean of the two middle values when
+            the count is even, or None if values is empty
+    """
+
+    count = len(values)
+
+    # bail early before sorting if 0 or 1 values in list
+    if count == 0:
+        return None
+    elif count == 1:
+        return values[0]
+
+    values = sorted(values)
+
+    # floor division so the index stays an int on Python 2 and Python 3
+    mid = count // 2
+
+    if count % 2 == 1:
+        # odd number of items, return middle value
+        return values[mid]
+    else:
+        # even number of items, return average of middle two items
+        return sum(values[mid - 1:mid + 1]) / 2.0
+
+def _stddev(values, population=False):
+    """ Calculate the standard deviation and variance of a list of values.
+
+        :param values: a sequence of ints or floats to calculate
+        :param population: True if values represents entire population,
+            False if it is a sample of the population
+        :returns: a tuple of (standard_deviation, variance), or
+            (None, None) when there are too few values (none at all, or
+            a single value when treated as a sample)
+    """
+
+    # a sample divides by n - 1 rather than n
+    count = len(values) if population else len(values) - 1
+
+    if count < 1:
+        # not enough values; avoid dividing by zero
+        return (None, None)
+
+    avg = _average(values)
+
+    # square the difference between each value and the average
+    diffsq = ((i - avg) ** 2 for i in values)
+
+    # the average of the squared differences
+    variance = sum(diffsq) / float(count)
+
+    return (math.sqrt(variance), variance) # stddev is sqrt of variance
+
+class StatsFilter(Filter):
+    """ Base for all stats filters.
+
+        :param field: name of the record field to collect values from
+        :param test: optional callable taking a record; when given, only
+            records for which it returns a true value are processed
+    """
+
+    def __init__(self, field, test=None):
+        self._field = field
+        self._test = test
+
+    def process_record(self, record):
+        # the record is returned unchanged whether or not it was counted
+        if self._test is None or self._test(record):
+            self.process_field(record[self._field])
+        return record
+
+    def process_field(self, item):
+        raise NotImplementedError('process_field not defined in ' +
+                                  self.__class__.__name__)
+
+    def value(self):
+        raise NotImplementedError('value not defined in ' +
+                                  self.__class__.__name__)
+
+class Sum(StatsFilter):
+    """ Calculate the sum of the values in a field. Field must contain either
+        int or float values.
+    """
+
+    def __init__(self, field, initial=0, **kwargs):
+        super(Sum, self).__init__(field, **kwargs)
+        self._value = initial
+
+    def process_field(self, item):
+        # None (or any other false value) adds nothing to the sum
+        self._value += item or 0
+
+    def value(self):
+        return self._value
+
+class Average(StatsFilter):
+    """ Calculate the average (mean) of the values in a field. Field must
+        contain either int or float values. value() returns None when no
+        values have been processed.
+    """
+
+    def __init__(self, field, initial=0, **kwargs):
+        super(Average, self).__init__(field, **kwargs)
+        self._value = initial
+        self._count = 0
+
+    def process_field(self, item):
+        if item is not None:
+            self._value += item
+            self._count += 1
+
+    def value(self):
+        if self._count == 0:
+            # no values processed yet; avoid dividing by zero
+            return None
+        return self._value / float(self._count)
+
+class Median(StatsFilter):
+    """ Calculate the median of the values in a field. Field must contain
+        either int or float values.
+
+        **This filter keeps a list of field values in memory.**
+    """
+
+    def __init__(self, field, **kwargs):
+        super(Median, self).__init__(field, **kwargs)
+        self._values = []
+
+    def process_field(self, item):
+        if item is not None:
+            self._values.append(item)
+
+    def value(self):
+        return _median(self._values)
+
+class MinMax(StatsFilter):
+    """ Find the minimum and maximum values in a field. Field must contain
+        either int or float values. value() returns a (min, max) tuple.
+    """
+
+    def __init__(self, field, **kwargs):
+        super(MinMax, self).__init__(field, **kwargs)
+        self._max = None
+        self._min = None
+
+    def process_field(self, item):
+        if item is not None:
+            if self._max is None or item > self._max:
+                self._max = item
+            if self._min is None or item < self._min:
+                self._min = item
+
+    def value(self):
+        return (self._min, self._max)
+
+class StandardDeviation(StatsFilter):
+    """ Calculate the standard deviation of the values in a field. Calling
+        value() will return a standard deviation for the sample. Pass
+        population=True to value() for the standard deviation of the
+        population. Convenience methods are provided for average() and
+        median(). Field must contain either int or float values.
+
+        **This filter keeps a list of field values in memory.**
+    """
+
+    def __init__(self, field, **kwargs):
+        super(StandardDeviation, self).__init__(field, **kwargs)
+        self._values = []
+
+    def process_field(self, item):
+        if item is not None:
+            self._values.append(item)
+
+    def average(self):
+        return _average(self._values)
+
+    def median(self):
+        return _median(self._values)
+
+    def value(self, population=False):
+        """ Return a tuple of (standard_deviation, variance).
+
+            :param population: True if values represents entire population,
+                False if values is a sample. Default: False
+        """
+        return _stddev(self._values, population)
diff --git a/saucebrush/tests/__init__.py b/saucebrush/tests/__init__.py
index 9b8c511..4297ebb 100644
--- a/saucebrush/tests/__init__.py
+++ b/saucebrush/tests/__init__.py
@@ -3,11 +3,13 @@ from saucebrush.tests.filters import FilterTestCase
 from saucebrush.tests.sources import SourceTestCase
 from saucebrush.tests.emitters import EmitterTestCase
 from saucebrush.tests.recipes import RecipeTestCase
+from saucebrush.tests.stats import StatsTestCase
 
 filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase)
 source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase)
 emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase)
 recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase)
+stats_suite = unittest.TestLoader().loadTestsFromTestCase(StatsTestCase)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/saucebrush/tests/stats.py b/saucebrush/tests/stats.py
new file mode 100644
--- /dev/null
+++ b/saucebrush/tests/stats.py
@@ -0,0 +1,62 @@
+import unittest
+from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation
+
+class StatsTestCase(unittest.TestCase):
+
+    def _simple_data(self):
+        return [{'a':1, 'b':2, 'c':3},
+                {'a':5, 'b':5, 'c':5},
+                {'a':1, 'b':10, 'c':100}]
+
+    def test_sum(self):
+        fltr = Sum('b')
+        list(fltr.attach(self._simple_data()))
+        self.assertEqual(fltr.value(), 17)
+
+    def test_average(self):
+        fltr = Average('c')
+        list(fltr.attach(self._simple_data()))
+        self.assertEqual(fltr.value(), 36.0)
+
+    def test_average_no_values(self):
+        # value() must not divide by zero before any record is seen
+        self.assertEqual(Average('c').value(), None)
+
+    def test_median(self):
+        fltr = Median('a')
+        list(fltr.attach(self._simple_data()))
+        self.assertEqual(fltr.value(), 1)
+
+    def test_median_even_count(self):
+        fltr = Median('a')
+        list(fltr.attach(self._simple_data() + [{'a':7, 'b':0, 'c':0}]))
+        self.assertEqual(fltr.value(), 3.0)
+
+    def test_minmax(self):
+        fltr = MinMax('b')
+        list(fltr.attach(self._simple_data()))
+        self.assertEqual(fltr.value(), (2, 10))
+
+    def test_standard_deviation(self):
+        fltr = StandardDeviation('c')
+        list(fltr.attach(self._simple_data()))
+        self.assertEqual(fltr.average(), 36.0)
+        self.assertEqual(fltr.median(), 5)
+
+        # compare floats approximately rather than by exact repr
+        stddev, variance = fltr.value()
+        self.assertAlmostEqual(stddev, 55.4346462061408)
+        self.assertAlmostEqual(variance, 3073.0)
+
+        stddev, variance = fltr.value(True)
+        self.assertAlmostEqual(stddev, 45.2621990922521)
+        self.assertAlmostEqual(variance, 2048.6666666666665)
+
+    def test_standard_deviation_too_few_values(self):
+        fltr = StandardDeviation('c')
+        self.assertEqual(fltr.value(), (None, None))
+        list(fltr.attach(self._simple_data()[:1]))
+        self.assertEqual(fltr.value(), (None, None))
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/saucebrush/utils.py b/saucebrush/utils.py
--- a/saucebrush/utils.py
+++ b/saucebrush/utils.py
@@ -1,3 +1,5 @@
+import os
+import urllib2
 """
     General utilities used within saucebrush that may be useful elsewhere.
 """
@@ -72,6 +74,12 @@ def str_or_list(obj):
 #
 
 class Files(object):
+    """ Iterate over multiple files as a single file. Pass the paths of the
+        files as arguments to the class constructor:
+
+            for line in Files('/path/to/file/a', '/path/to/file/b'):
+                pass
+    """
 
     def __init__(self, *args):
         self.paths = []
@@ -86,7 +94,6 @@ class Files(object):
         return self.linereader()
 
     def linereader(self):
-        import os
         for path in iter(self.paths):
             if os.path.exists(path):
                 if self.file_open_callback:
@@ -95,3 +102,21 @@ class Files(object):
                 for line in f:
                     yield line
                 f.close()
+
+class RemoteFile(object):
+    """ Stream data from a remote file.
+
+        :param url: URL to remote file
+    """
+
+    def __init__(self, url):
+        self._url = url
+
+    def __iter__(self):
+        resp = urllib2.urlopen(self._url)
+        try:
+            for line in resp:
+                yield line.rstrip()
+        finally:
+            # close the response even if iteration stops early or fails
+            resp.close()