Merge pull request #1 from jamesturk/modernize

Modernize
James Turk 2022-11-10 23:14:42 -05:00 committed by GitHub
commit 40008d7b53
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
27 changed files with 1658 additions and 1088 deletions

1
.github/FUNDING.yml vendored Normal file

@@ -0,0 +1 @@
github: [jamesturk]

17
.github/ISSUE_TEMPLATE/bug_report.md vendored Normal file

@@ -0,0 +1,17 @@
---
name: Bug report
about: Create a report to help us improve
title: ""
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Environment**
Please provide output of `python -V` & `spatula --version`, as well as what operating system you're using, and any other details:
**Additional context**
Add any other context about the problem here.

20
.github/ISSUE_TEMPLATE/feature_request.md vendored Normal file

@@ -0,0 +1,20 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context about the feature request here.

36
.github/workflows/test.yml vendored Normal file

@@ -0,0 +1,36 @@
name: Test & Lint
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
max-parallel: 4
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
steps:
# Python & dependency installation
- uses: actions/checkout@v3
- name: setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: install Poetry
uses: snok/install-poetry@v1.2.1
- name: set poetry config path
run: poetry config virtualenvs.path ~/.virtualenvs
- name: install dependencies
run: poetry install
# - name: lint with mypy
# run: poetry run mypy src
- name: lint with flake8
run: poetry run flake8 --show-source --statistics --ignore=E203,E501,W503 src
- name: pytest
run: poetry run pytest

9
.travis.yml

@@ -1,9 +0,0 @@
language: python
python:
- "2.7"
- "3.5"
install: pip install nose
script: nosetests
notifications:
email:
- james.p.turk@gmail.com

0
README.md Normal file


@@ -1,7 +1,7 @@
import re
import exceptions
-class FECSource(object):
+class FECSource:
    SPLIT_CHAR = '\x1c'
    FORM_FIELDS = {

395
poetry.lock generated Normal file

@@ -0,0 +1,395 @@
[[package]]
name = "attrs"
version = "22.1.0"
description = "Classes Without Boilerplate"
category = "dev"
optional = false
python-versions = ">=3.5"
[package.extras]
dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy (>=0.900,!=0.940)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "sphinx", "sphinx-notfound-page", "zope.interface"]
docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"]
tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"]
[[package]]
name = "black"
version = "22.10.0"
description = "The uncompromising code formatter."
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "click"
version = "8.1.3"
description = "Composable command line interface toolkit"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
[[package]]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = false
python-versions = ">=3.7"
[[package]]
name = "exceptiongroup"
version = "1.0.1"
description = "Backport of PEP 654 (exception groups)"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.extras]
test = ["pytest (>=6)"]
[[package]]
name = "flake8"
version = "5.0.4"
description = "the modular source code checker: pep8 pyflakes and co"
category = "dev"
optional = false
python-versions = ">=3.6.1"
[package.dependencies]
mccabe = ">=0.7.0,<0.8.0"
pycodestyle = ">=2.9.0,<2.10.0"
pyflakes = ">=2.5.0,<2.6.0"
[[package]]
name = "iniconfig"
version = "1.1.1"
description = "iniconfig: brain-dead simple config-ini parsing"
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "lxml"
version = "4.9.1"
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
[package.extras]
cssselect = ["cssselect (>=0.7)"]
html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
source = ["Cython (>=0.29.7)"]
[[package]]
name = "mccabe"
version = "0.7.0"
description = "McCabe checker, plugin for flake8"
category = "dev"
optional = false
python-versions = ">=3.6"
[[package]]
name = "mypy-extensions"
version = "0.4.3"
description = "Experimental type system extensions for programs checked with the mypy typechecker."
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "packaging"
version = "21.3"
description = "Core utilities for Python packages"
category = "dev"
optional = false
python-versions = ">=3.6"
[package.dependencies]
pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
[[package]]
name = "pathspec"
version = "0.10.1"
description = "Utility library for gitignore style pattern matching of file paths."
category = "dev"
optional = false
python-versions = ">=3.7"
[[package]]
name = "platformdirs"
version = "2.5.3"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
python-versions = ">=3.7"
[package.extras]
docs = ["furo (>=2022.9.29)", "proselint (>=0.13)", "sphinx (>=5.3)", "sphinx-autodoc-typehints (>=1.19.4)"]
test = ["appdirs (==1.4.4)", "pytest (>=7.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"]
[[package]]
name = "pluggy"
version = "1.0.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
optional = false
python-versions = ">=3.6"
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pycodestyle"
version = "2.9.1"
description = "Python style guide checker"
category = "dev"
optional = false
python-versions = ">=3.6"
[[package]]
name = "pyflakes"
version = "2.5.0"
description = "passive checker of Python programs"
category = "dev"
optional = false
python-versions = ">=3.6"
[[package]]
name = "pyparsing"
version = "3.0.9"
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
category = "dev"
optional = false
python-versions = ">=3.6.8"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "pytest"
version = "7.2.0"
description = "pytest: simple powerful testing with Python"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
[[package]]
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
category = "dev"
optional = false
python-versions = ">=3.7"
[metadata]
lock-version = "1.1"
python-versions = "^3.10"
content-hash = "765977e700b56e9b852f6ca6f5d54e2c1343b3a07b9220e83ef969a277f67866"
[metadata.files]
attrs = [
{file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"},
{file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"},
]
black = [
{file = "black-22.10.0-1fixedarch-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5cc42ca67989e9c3cf859e84c2bf014f6633db63d1cbdf8fdb666dcd9e77e3fa"},
{file = "black-22.10.0-1fixedarch-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5d8f74030e67087b219b032aa33a919fae8806d49c867846bfacde57f43972ef"},
{file = "black-22.10.0-1fixedarch-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:197df8509263b0b8614e1df1756b1dd41be6738eed2ba9e9769f3880c2b9d7b6"},
{file = "black-22.10.0-1fixedarch-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:2644b5d63633702bc2c5f3754b1b475378fbbfb481f62319388235d0cd104c2d"},
{file = "black-22.10.0-1fixedarch-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:e41a86c6c650bcecc6633ee3180d80a025db041a8e2398dcc059b3afa8382cd4"},
{file = "black-22.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2039230db3c6c639bd84efe3292ec7b06e9214a2992cd9beb293d639c6402edb"},
{file = "black-22.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ff67aec0a47c424bc99b71005202045dc09270da44a27848d534600ac64fc7"},
{file = "black-22.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:819dc789f4498ecc91438a7de64427c73b45035e2e3680c92e18795a839ebb66"},
{file = "black-22.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b9b29da4f564ba8787c119f37d174f2b69cdfdf9015b7d8c5c16121ddc054ae"},
{file = "black-22.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b49776299fece66bffaafe357d929ca9451450f5466e997a7285ab0fe28e3b"},
{file = "black-22.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:21199526696b8f09c3997e2b4db8d0b108d801a348414264d2eb8eb2532e540d"},
{file = "black-22.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e464456d24e23d11fced2bc8c47ef66d471f845c7b7a42f3bd77bf3d1789650"},
{file = "black-22.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9311e99228ae10023300ecac05be5a296f60d2fd10fff31cf5c1fa4ca4b1988d"},
{file = "black-22.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fba8a281e570adafb79f7755ac8721b6cf1bbf691186a287e990c7929c7692ff"},
{file = "black-22.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:915ace4ff03fdfff953962fa672d44be269deb2eaf88499a0f8805221bc68c87"},
{file = "black-22.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:444ebfb4e441254e87bad00c661fe32df9969b2bf224373a448d8aca2132b395"},
{file = "black-22.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:974308c58d057a651d182208a484ce80a26dac0caef2895836a92dd6ebd725e0"},
{file = "black-22.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72ef3925f30e12a184889aac03d77d031056860ccae8a1e519f6cbb742736383"},
{file = "black-22.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:432247333090c8c5366e69627ccb363bc58514ae3e63f7fc75c54b1ea80fa7de"},
{file = "black-22.10.0-py3-none-any.whl", hash = "sha256:c957b2b4ea88587b46cf49d1dc17681c1e672864fd7af32fc1e9664d572b3458"},
{file = "black-22.10.0.tar.gz", hash = "sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1"},
]
click = [
{file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
{file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
]
colorama = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
cssselect = [
{file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"},
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]
exceptiongroup = [
{file = "exceptiongroup-1.0.1-py3-none-any.whl", hash = "sha256:4d6c0aa6dd825810941c792f53d7b8d71da26f5e5f84f20f9508e8f2d33b140a"},
{file = "exceptiongroup-1.0.1.tar.gz", hash = "sha256:73866f7f842ede6cb1daa42c4af078e2035e5f7607f0e2c762cc51bb31bbe7b2"},
]
flake8 = [
{file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
{file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
]
iniconfig = [
{file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
]
lxml = [
{file = "lxml-4.9.1-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed"},
{file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc"},
{file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc"},
{file = "lxml-4.9.1-cp27-cp27m-win32.whl", hash = "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3"},
{file = "lxml-4.9.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627"},
{file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84"},
{file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837"},
{file = "lxml-4.9.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad"},
{file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5"},
{file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8"},
{file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8"},
{file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d"},
{file = "lxml-4.9.1-cp310-cp310-win32.whl", hash = "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7"},
{file = "lxml-4.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b"},
{file = "lxml-4.9.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d"},
{file = "lxml-4.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3"},
{file = "lxml-4.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29"},
{file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d"},
{file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318"},
{file = "lxml-4.9.1-cp35-cp35m-win32.whl", hash = "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7"},
{file = "lxml-4.9.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4"},
{file = "lxml-4.9.1-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf"},
{file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3"},
{file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391"},
{file = "lxml-4.9.1-cp36-cp36m-win32.whl", hash = "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e"},
{file = "lxml-4.9.1-cp36-cp36m-win_amd64.whl", hash = "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7"},
{file = "lxml-4.9.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca"},
{file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785"},
{file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785"},
{file = "lxml-4.9.1-cp37-cp37m-win32.whl", hash = "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a"},
{file = "lxml-4.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e"},
{file = "lxml-4.9.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715"},
{file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036"},
{file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387"},
{file = "lxml-4.9.1-cp38-cp38-win32.whl", hash = "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94"},
{file = "lxml-4.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345"},
{file = "lxml-4.9.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000"},
{file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25"},
{file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd"},
{file = "lxml-4.9.1-cp39-cp39-win32.whl", hash = "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb"},
{file = "lxml-4.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d"},
{file = "lxml-4.9.1-pp37-pypy37_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c"},
{file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b"},
{file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc"},
{file = "lxml-4.9.1-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b"},
{file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2"},
{file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73"},
{file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c"},
{file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9"},
{file = "lxml-4.9.1.tar.gz", hash = "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"},
]
mccabe = [
{file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
]
mypy-extensions = [
{file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
{file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
]
packaging = [
{file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
{file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
]
pathspec = [
{file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"},
{file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"},
]
platformdirs = [
{file = "platformdirs-2.5.3-py3-none-any.whl", hash = "sha256:0cb405749187a194f444c25c82ef7225232f11564721eabffc6ec70df83b11cb"},
{file = "platformdirs-2.5.3.tar.gz", hash = "sha256:6e52c21afff35cb659c6e52d8b4d61b9bd544557180440538f255d9382c8cbe0"},
]
pluggy = [
{file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
{file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
]
pycodestyle = [
{file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
{file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
]
pyflakes = [
{file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
{file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
]
pyparsing = [
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
pytest = [
{file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"},
{file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"},
]
tomli = [
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]

22
pyproject.toml Normal file

@@ -0,0 +1,22 @@
[tool.poetry]
name = "saucebrush"
version = "0.6.0"
description = ""
authors = ["James Turk <dev@jamesturk.net>"]
license = "MIT"
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
lxml = "^4.9.1"
cssselect = "^1.2.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.0"
flake8 = "^5.0.4"
black = "^22.10.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

15
saucebrush/tests/__init__.py

@@ -1,15 +0,0 @@
import unittest
from saucebrush.tests.filters import FilterTestCase
from saucebrush.tests.sources import SourceTestCase
from saucebrush.tests.emitters import EmitterTestCase
from saucebrush.tests.recipes import RecipeTestCase
from saucebrush.tests.stats import StatsTestCase
filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase)
source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase)
emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase)
recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase)
stats_suite = unittest.TestLoader().loadTestsFromTestCase(StatsTestCase)
if __name__ == '__main__':
unittest.main()

86
saucebrush/tests/emitters.py

@@ -1,86 +0,0 @@
from __future__ import unicode_literals
from contextlib import closing
from io import StringIO
import os
import unittest
from saucebrush.emitters import (
DebugEmitter, CSVEmitter, CountEmitter, SqliteEmitter, SqlDumpEmitter)
class EmitterTestCase(unittest.TestCase):
def test_debug_emitter(self):
with closing(StringIO()) as output:
de = DebugEmitter(output)
list(de.attach([1,2,3]))
self.assertEqual(output.getvalue(), '1\n2\n3\n')
def test_count_emitter(self):
# values for test
values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
with closing(StringIO()) as output:
# test without of parameter
ce = CountEmitter(every=10, outfile=output, format="%(count)s records\n")
list(ce.attach(values))
self.assertEqual(output.getvalue(), '10 records\n20 records\n')
ce.done()
self.assertEqual(output.getvalue(), '10 records\n20 records\n22 records\n')
with closing(StringIO()) as output:
# test with of parameter
ce = CountEmitter(every=10, outfile=output, of=len(values))
list(ce.attach(values))
self.assertEqual(output.getvalue(), '10 of 22\n20 of 22\n')
ce.done()
self.assertEqual(output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n')
def test_csv_emitter(self):
try:
import cStringIO # if Python 2.x then use old cStringIO
io = cStringIO.StringIO()
except:
io = StringIO() # if Python 3.x then use StringIO
with closing(io) as output:
ce = CSVEmitter(output, ('x','y','z'))
list(ce.attach([{'x':1, 'y':2, 'z':3}, {'x':5, 'y':5, 'z':5}]))
self.assertEqual(output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n')
def test_sqlite_emitter(self):
import sqlite3, tempfile
with closing(tempfile.NamedTemporaryFile(suffix='.db')) as f:
db_path = f.name
sle = SqliteEmitter(db_path, 'testtable', fieldnames=('a','b','c'))
list(sle.attach([{'a': '1', 'b': '2', 'c': '3'}]))
sle.done()
with closing(sqlite3.connect(db_path)) as conn:
cur = conn.cursor()
cur.execute("""SELECT a, b, c FROM testtable""")
results = cur.fetchall()
os.unlink(db_path)
self.assertEqual(results, [('1', '2', '3')])
def test_sql_dump_emitter(self):
with closing(StringIO()) as bffr:
sde = SqlDumpEmitter(bffr, 'testtable', ('a', 'b'))
list(sde.attach([{'a': 1, 'b': '2'}]))
sde.done()
self.assertEqual(bffr.getvalue(), "INSERT INTO `testtable` (`a`,`b`) VALUES (1,'2');\n")
if __name__ == '__main__':
unittest.main()
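These deleted unittest cases were the de facto usage docs for the emitters. A minimal standalone sketch of the same behavior, mirroring the assertions above (variable names are illustrative, not from the repo):

from io import StringIO
from saucebrush.emitters import DebugEmitter, CountEmitter

output = StringIO()
debug = DebugEmitter(output)
list(debug.attach([1, 2, 3]))      # attach() is lazy; draining it emits each record
assert output.getvalue() == "1\n2\n3\n"

counts = StringIO()
counter = CountEmitter(every=2, of=3, outfile=counts)
list(counter.attach([{"n": i} for i in range(3)]))
counter.done()                     # done() flushes the final count
assert counts.getvalue() == "2 of 3\n3 of 3\n"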

304
saucebrush/tests/filters.py

@@ -1,304 +0,0 @@
import unittest
import operator
import types
from saucebrush.filters import (Filter, YieldFilter, FieldFilter,
SubrecordFilter, ConditionalPathFilter,
ConditionalFilter, FieldModifier, FieldKeeper,
FieldRemover, FieldMerger, FieldAdder,
FieldCopier, FieldRenamer, Unique)
class DummyRecipe(object):
rejected_record = None
rejected_msg = None
def reject_record(self, record, msg):
self.rejected_record = record
self.rejected_msg = msg
class Doubler(Filter):
def process_record(self, record):
return record*2
class OddRemover(Filter):
def process_record(self, record):
if record % 2 == 0:
return record
else:
return None # explicitly return None
class ListFlattener(YieldFilter):
def process_record(self, record):
for item in record:
yield item
class FieldDoubler(FieldFilter):
def process_field(self, item):
return item*2
class NonModifyingFieldDoubler(Filter):
def __init__(self, key):
self.key = key
def process_record(self, record):
record = dict(record)
record[self.key] *= 2
return record
class ConditionalOddRemover(ConditionalFilter):
def test_record(self, record):
# return True for even values
return record % 2 == 0
class FilterTestCase(unittest.TestCase):
def _simple_data(self):
return [{'a':1, 'b':2, 'c':3},
{'a':5, 'b':5, 'c':5},
{'a':1, 'b':10, 'c':100}]
def assert_filter_result(self, filter_obj, expected_data):
result = filter_obj.attach(self._simple_data())
self.assertEqual(list(result), expected_data)
def test_reject_record(self):
recipe = DummyRecipe()
f = Doubler()
result = f.attach([1,2,3], recipe=recipe)
# next has to be called for attach to take effect
next(result)
f.reject_record('bad', 'this one was bad')
# ensure that the rejection propagated to the recipe
self.assertEqual('bad', recipe.rejected_record)
self.assertEqual('this one was bad', recipe.rejected_msg)
def test_simple_filter(self):
df = Doubler()
result = df.attach([1,2,3])
# ensure we got a generator that yields 2,4,6
self.assertEqual(type(result), types.GeneratorType)
self.assertEqual(list(result), [2,4,6])
def test_simple_filter_return_none(self):
cf = OddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEqual(list(result), [0,2,4,6,8])
def test_simple_yield_filter(self):
lf = ListFlattener()
result = lf.attach([[1],[2,3],[4,5,6]])
# ensure we got a generator that yields 1,2,3,4,5,6
self.assertEqual(type(result), types.GeneratorType)
self.assertEqual(list(result), [1,2,3,4,5,6])
def test_simple_field_filter(self):
ff = FieldDoubler(['a', 'c'])
# check against expected data
expected_data = [{'a':2, 'b':2, 'c':6},
{'a':10, 'b':5, 'c':10},
{'a':2, 'b':10, 'c':200}]
self.assert_filter_result(ff, expected_data)
def test_conditional_filter(self):
cf = ConditionalOddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEqual(list(result), [0,2,4,6,8])
### Tests for Subrecord
def test_subrecord_filter_list(self):
data = [{'a': [{'b': 2}, {'b': 4}]},
{'a': [{'b': 5}]},
{'a': [{'b': 8}, {'b':2}, {'b':1}]}]
expected = [{'a': [{'b': 4}, {'b': 8}]},
{'a': [{'b': 10}]},
{'a': [{'b': 16}, {'b':4}, {'b':2}]}]
sf = SubrecordFilter('a', NonModifyingFieldDoubler('b'))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_deep(self):
data = [{'a': {'d':[{'b': 2}, {'b': 4}]}},
{'a': {'d':[{'b': 5}]}},
{'a': {'d':[{'b': 8}, {'b':2}, {'b':1}]}}]
expected = [{'a': {'d':[{'b': 4}, {'b': 8}]}},
{'a': {'d':[{'b': 10}]}},
{'a': {'d':[{'b': 16}, {'b':4}, {'b':2}]}}]
sf = SubrecordFilter('a.d', NonModifyingFieldDoubler('b'))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_nonlist(self):
data = [
{'a':{'b':{'c':1}}},
{'a':{'b':{'c':2}}},
{'a':{'b':{'c':3}}},
]
expected = [
{'a':{'b':{'c':2}}},
{'a':{'b':{'c':4}}},
{'a':{'b':{'c':6}}},
]
sf = SubrecordFilter('a.b', NonModifyingFieldDoubler('c'))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_list_in_path(self):
data = [
{'a': [{'b': {'c': 5}}, {'b': {'c': 6}}]},
{'a': [{'b': {'c': 1}}, {'b': {'c': 2}}, {'b': {'c': 3}}]},
{'a': [{'b': {'c': 2}} ]}
]
expected = [
{'a': [{'b': {'c': 10}}, {'b': {'c': 12}}]},
{'a': [{'b': {'c': 2}}, {'b': {'c': 4}}, {'b': {'c': 6}}]},
{'a': [{'b': {'c': 4}} ]}
]
sf = SubrecordFilter('a.b', NonModifyingFieldDoubler('c'))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_conditional_path(self):
predicate = lambda r: r['a'] == 1
# double b if a == 1, otherwise double c
cpf = ConditionalPathFilter(predicate, FieldDoubler('b'),
FieldDoubler('c'))
expected_data = [{'a':1, 'b':4, 'c':3},
{'a':5, 'b':5, 'c':10},
{'a':1, 'b':20, 'c':100}]
self.assert_filter_result(cpf, expected_data)
### Tests for Generic Filters
def test_field_modifier(self):
# another version of FieldDoubler
fm = FieldModifier(['a', 'c'], lambda x: x*2)
# check against expected data
expected_data = [{'a':2, 'b':2, 'c':6},
{'a':10, 'b':5, 'c':10},
{'a':2, 'b':10, 'c':200}]
self.assert_filter_result(fm, expected_data)
def test_field_keeper(self):
fk = FieldKeeper(['c'])
# check against expected results
expected_data = [{'c':3}, {'c':5}, {'c':100}]
self.assert_filter_result(fk, expected_data)
def test_field_remover(self):
fr = FieldRemover(['a', 'b'])
# check against expected results
expected_data = [{'c':3}, {'c':5}, {'c':100}]
self.assert_filter_result(fr, expected_data)
def test_field_merger(self):
fm = FieldMerger({'sum':('a','b','c')}, lambda x,y,z: x+y+z)
# check against expected results
expected_data = [{'sum':6}, {'sum':15}, {'sum':111}]
self.assert_filter_result(fm, expected_data)
def test_field_merger_keep_fields(self):
fm = FieldMerger({'sum':('a','b','c')}, lambda x,y,z: x+y+z,
keep_fields=True)
# check against expected results
expected_data = [{'a':1, 'b':2, 'c':3, 'sum':6},
{'a':5, 'b':5, 'c':5, 'sum':15},
{'a':1, 'b':10, 'c':100, 'sum': 111}]
self.assert_filter_result(fm, expected_data)
def test_field_adder_scalar(self):
fa = FieldAdder('x', 7)
expected_data = [{'a':1, 'b':2, 'c':3, 'x':7},
{'a':5, 'b':5, 'c':5, 'x':7},
{'a':1, 'b':10, 'c':100, 'x': 7}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_callable(self):
fa = FieldAdder('x', lambda: 7)
expected_data = [{'a':1, 'b':2, 'c':3, 'x':7},
{'a':5, 'b':5, 'c':5, 'x':7},
{'a':1, 'b':10, 'c':100, 'x': 7}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_iterable(self):
fa = FieldAdder('x', [1,2,3])
expected_data = [{'a':1, 'b':2, 'c':3, 'x':1},
{'a':5, 'b':5, 'c':5, 'x':2},
{'a':1, 'b':10, 'c':100, 'x': 3}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_replace(self):
fa = FieldAdder('b', lambda: 7)
expected_data = [{'a':1, 'b':7, 'c':3},
{'a':5, 'b':7, 'c':5},
{'a':1, 'b':7, 'c':100}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_no_replace(self):
fa = FieldAdder('b', lambda: 7, replace=False)
expected_data = [{'a':1, 'b':2, 'c':3},
{'a':5, 'b':5, 'c':5},
{'a':1, 'b':10, 'c':100}]
self.assert_filter_result(fa, expected_data)
def test_field_copier(self):
fc = FieldCopier({'a2':'a', 'b2':'b'})
expected_data = [{'a':1, 'b':2, 'c':3, 'a2':1, 'b2':2},
{'a':5, 'b':5, 'c':5, 'a2':5, 'b2':5},
{'a':1, 'b':10, 'c':100, 'a2': 1, 'b2': 10}]
self.assert_filter_result(fc, expected_data)
def test_field_renamer(self):
fr = FieldRenamer({'x':'a', 'y':'b'})
expected_data = [{'x':1, 'y':2, 'c':3},
{'x':5, 'y':5, 'c':5},
{'x':1, 'y':10, 'c':100}]
self.assert_filter_result(fr, expected_data)
# TODO: splitter & flattner tests?
def test_unique_filter(self):
u = Unique()
in_data = [{'a': 77}, {'a':33}, {'a': 77}]
expected_data = [{'a': 77}, {'a':33}]
result = u.attach(in_data)
self.assertEqual(list(result), expected_data)
# TODO: unicode & string filter tests
if __name__ == '__main__':
unittest.main()
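The pattern these tests exercise — subclass FieldFilter and override process_field — is the library's main extension point. A hedged sketch assuming only that API (FieldUpper is a made-up name):

from saucebrush.filters import FieldFilter

class FieldUpper(FieldFilter):
    """Uppercase the configured fields; process_field is called once per value."""
    def process_field(self, item):
        return item.upper()

rows = [{"name": "ada", "lang": "py"}, {"name": "bob", "lang": "js"}]
print(list(FieldUpper(["name"]).attach(rows)))
# [{'name': 'ADA', 'lang': 'py'}, {'name': 'BOB', 'lang': 'js'}]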

53
saucebrush/tests/recipes.py

@@ -1,53 +0,0 @@
import doctest
import unittest
from saucebrush import Recipe, run_recipe, SaucebrushError, OvercookedError
from saucebrush.filters import Filter
class Raiser(Filter):
def process_record(self, record):
raise Exception("bad record")
class Saver(Filter):
def __init__(self):
self.saved = []
def process_record(self, record):
self.saved.append(record)
return record
class RecipeTestCase(unittest.TestCase):
def test_error_stream(self):
saver = Saver()
recipe = Recipe(Raiser(), error_stream=saver)
recipe.run([{'a': 1}, {'b': 2}])
recipe.done()
self.assertEqual(saver.saved[0]['record'], {'a': 1})
self.assertEqual(saver.saved[1]['record'], {'b': 2})
# Must pass either a Recipe, a Filter or an iterable of Filters
# as the error_stream argument
self.assertRaises(SaucebrushError, Recipe, error_stream=5)
def test_run_recipe(self):
saver = Saver()
run_recipe([1, 2], saver)
self.assertEqual(saver.saved, [1, 2])
def test_done(self):
saver = Saver()
recipe = Recipe(saver)
recipe.run([1])
recipe.done()
self.assertRaises(OvercookedError, recipe.run, [2])
self.assertRaises(OvercookedError, recipe.done)
self.assertEqual(saver.saved, [1])
if __name__ == '__main__':
unittest.main()
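A short sketch of the error_stream behavior verified above, using hypothetical Boom/Keep filters in place of the test's Raiser/Saver:

from saucebrush import Recipe
from saucebrush.filters import Filter

class Boom(Filter):
    def process_record(self, record):
        raise ValueError("bad record")

class Keep(Filter):
    def __init__(self):
        self.saved = []
    def process_record(self, record):
        self.saved.append(record)
        return record

errors = Keep()
recipe = Recipe(Boom(), error_stream=errors)
recipe.run([{"a": 1}])
recipe.done()
# failed records arrive wrapped as {"record": ..., "exception": repr(exc)}
print(errors.saved[0]["record"])   # {'a': 1}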

87
saucebrush/tests/sources.py

@@ -1,87 +0,0 @@
from __future__ import unicode_literals
from io import BytesIO, StringIO
import unittest
from saucebrush.sources import (
CSVSource, FixedWidthFileSource, HtmlTableSource, JSONSource)
class SourceTestCase(unittest.TestCase):
def _get_csv(self):
data = '''a,b,c
1,2,3
5,5,5
1,10,100'''
return StringIO(data)
def test_csv_source_basic(self):
source = CSVSource(self._get_csv())
expected_data = [{'a':'1', 'b':'2', 'c':'3'},
{'a':'5', 'b':'5', 'c':'5'},
{'a':'1', 'b':'10', 'c':'100'}]
self.assertEqual(list(source), expected_data)
def test_csv_source_fieldnames(self):
source = CSVSource(self._get_csv(), ['x','y','z'])
expected_data = [{'x':'a', 'y':'b', 'z':'c'},
{'x':'1', 'y':'2', 'z':'3'},
{'x':'5', 'y':'5', 'z':'5'},
{'x':'1', 'y':'10', 'z':'100'}]
self.assertEqual(list(source), expected_data)
def test_csv_source_skiprows(self):
source = CSVSource(self._get_csv(), skiprows=1)
expected_data = [{'a':'5', 'b':'5', 'c':'5'},
{'a':'1', 'b':'10', 'c':'100'}]
self.assertEqual(list(source), expected_data)
def test_fixed_width_source(self):
data = StringIO('JamesNovember 3 1986\nTim September151999')
fields = (('name',5), ('month',9), ('day',2), ('year',4))
source = FixedWidthFileSource(data, fields)
expected_data = [{'name':'James', 'month':'November', 'day':'3',
'year':'1986'},
{'name':'Tim', 'month':'September', 'day':'15',
'year':'1999'}]
self.assertEqual(list(source), expected_data)
def test_json_source(self):
content = StringIO("""[{"a": 1, "b": "2", "c": 3}]""")
js = JSONSource(content)
self.assertEqual(list(js), [{'a': 1, 'b': '2', 'c': 3}])
def test_html_table_source(self):
content = StringIO("""
<html>
<table id="thetable">
<tr>
<th>a</th>
<th>b</th>
<th>c</th>
</tr>
<tr>
<td>1</td>
<td>2</td>
<td>3</td>
</tr>
</table>
</html>
""")
try:
import lxml
hts = HtmlTableSource(content, 'thetable')
self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}])
except ImportError:
# Python 2.6 doesn't have skipTest. We'll just suffer without it.
if hasattr(self, 'skipTest'):
self.skipTest("lxml is not installed")
if __name__ == '__main__':
unittest.main()
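Based on the CSV test above, a minimal CSVSource sketch (any file-like object works; note all values come back as strings):

from io import StringIO
from saucebrush.sources import CSVSource

csvfile = StringIO("a,b,c\n1,2,3\n5,5,5\n")
for record in CSVSource(csvfile):
    print(record)   # {'a': '1', 'b': '2', 'c': '3'} ...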

52
saucebrush/tests/stats.py

@@ -1,52 +0,0 @@
import unittest
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram
class StatsTestCase(unittest.TestCase):
def _simple_data(self):
return [{'a':1, 'b':2, 'c':3},
{'a':5, 'b':5, 'c':5},
{'a':1, 'b':10, 'c':100}]
def test_sum(self):
fltr = Sum('b')
list(fltr.attach(self._simple_data()))
self.assertEqual(fltr.value(), 17)
def test_average(self):
fltr = Average('c')
list(fltr.attach(self._simple_data()))
self.assertEqual(fltr.value(), 36.0)
def test_median(self):
# odd number of values
fltr = Median('a')
list(fltr.attach(self._simple_data()))
self.assertEqual(fltr.value(), 1)
# even number of values
fltr = Median('a')
list(fltr.attach(self._simple_data()[:2]))
self.assertEqual(fltr.value(), 3)
def test_minmax(self):
fltr = MinMax('b')
list(fltr.attach(self._simple_data()))
self.assertEqual(fltr.value(), (2, 10))
def test_standard_deviation(self):
fltr = StandardDeviation('c')
list(fltr.attach(self._simple_data()))
self.assertEqual(fltr.average(), 36.0)
self.assertEqual(fltr.median(), 5)
self.assertEqual(fltr.value(), (55.4346462061408, 3073.0))
self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665))
def test_histogram(self):
fltr = Histogram('a')
fltr.label_length = 1
list(fltr.attach(self._simple_data()))
self.assertEqual(str(fltr), "\n1 **\n5 *\n")
if __name__ == '__main__':
unittest.main()
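The stats filters are pass-through, so they can be chained in one pipeline; a small sketch consistent with the tests above:

from saucebrush.stats import Sum, Average

rows = [{"a": 1}, {"a": 5}, {"a": 1}]
total = Sum("a")
avg = Average("a")
list(avg.attach(total.attach(rows)))   # draining the chain computes both
print(total.value(), avg.value())      # 7 2.3333333333333335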

7
setup.py

@@ -1,7 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
setup(name="saucebrush",
version='0.5.0-dev',
packages=['saucebrush'],
)

saucebrush/__init__.py

@@ -2,7 +2,7 @@
Saucebrush is a data loading & manipulation framework written in python.
"""
-from . import filters, emitters, sources, utils
+from . import filters, emitters, sources, utils  # noqa
class SaucebrushError(Exception):
@@ -13,39 +13,39 @@ class OvercookedError(Exception):
    """
    Exception for trying to operate on a Recipe that has been finished.
    """
    pass
-class Recipe(object):
+class Recipe:
    def __init__(self, *filter_args, **kwargs):
        self.finished = False
        self.filters = []
        for filter in filter_args:
-            if hasattr(filter, 'filters'):
+            if hasattr(filter, "filters"):
                self.filters.extend(filter.filters)
            else:
                self.filters.append(filter)
-        self.error_stream = kwargs.get('error_stream')
+        self.error_stream = kwargs.get("error_stream")
        if self.error_stream and not isinstance(self.error_stream, Recipe):
            if isinstance(self.error_stream, filters.Filter):
                self.error_stream = Recipe(self.error_stream)
-            elif hasattr(self.error_stream, '__iter__'):
+            elif hasattr(self.error_stream, "__iter__"):
                self.error_stream = Recipe(*self.error_stream)
            else:
-                raise SaucebrushError('error_stream must be either a filter'
-                                      ' or an iterable of filters')
+                raise SaucebrushError(
+                    "error_stream must be either a filter" " or an iterable of filters"
+                )
    def reject_record(self, record, exception):
        if self.error_stream:
-            self.error_stream.run([{'record': record,
-                                    'exception': repr(exception)}])
+            self.error_stream.run([{"record": record, "exception": repr(exception)}])
    def run(self, source):
        if self.finished:
-            raise OvercookedError('run() called on finished recipe')
+            raise OvercookedError("run() called on finished recipe")
        # connect datapath
        data = source
@@ -58,7 +58,7 @@ class Recipe(object):
    def done(self):
        if self.finished:
-            raise OvercookedError('done() called on finished recipe')
+            raise OvercookedError("done() called on finished recipe")
        self.finished = True
@@ -70,12 +70,11 @@ class Recipe(object):
        try:
            filter_.done()
        except AttributeError:
            pass  # don't care if there isn't a done method
def run_recipe(source, *filter_args, **kwargs):
-    """ Process data, taking it from a source and applying any number of filters
-    """
+    """Process data, taking it from a source and applying any number of filters"""
    r = Recipe(*filter_args, **kwargs)
    r.run(source)
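Putting this module's pieces together, a minimal run_recipe sketch consistent with the API shown in this diff (the sample records are invented):

from saucebrush import run_recipe
from saucebrush.filters import FieldRemover
from saucebrush.emitters import DebugEmitter

run_recipe(
    [{"id": 1, "secret": "x"}, {"id": 2, "secret": "y"}],
    FieldRemover(["secret"]),
    DebugEmitter(),        # filters run in order; emitters are just filters
)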

saucebrush/emitters.py

@ -2,49 +2,53 @@
Saucebrush Emitters are filters that instead of modifying the record, output Saucebrush Emitters are filters that instead of modifying the record, output
it in some manner. it in some manner.
""" """
from __future__ import unicode_literals
from saucebrush.filters import Filter from saucebrush.filters import Filter
class Emitter(Filter): class Emitter(Filter):
""" ABC for emitters """ABC for emitters
All derived emitters must provide an emit_record(self, record) that All derived emitters must provide an emit_record(self, record) that
takes a single record (python dictionary). takes a single record (python dictionary).
Emitters can optionally define a done() method that is called after Emitters can optionally define a done() method that is called after
all records are processed (allowing database flushes, or printing of all records are processed (allowing database flushes, or printing of
aggregate data). aggregate data).
""" """
def process_record(self, record): def process_record(self, record):
self.emit_record(record) self.emit_record(record)
return record return record
def emit_record(self, record): def emit_record(self, record):
""" Abstract method to be overridden. """Abstract method to be overridden.
Called with a single record, should "emit" the record unmodified. Called with a single record, should "emit" the record unmodified.
""" """
raise NotImplementedError('emit_record not defined in ' + raise NotImplementedError(
self.__class__.__name__) "emit_record not defined in " + self.__class__.__name__
)
def done(self): def done(self):
""" No-op Method to be overridden. """No-op Method to be overridden.
Called when all processing is complete Called when all processing is complete
""" """
pass pass
class DebugEmitter(Emitter): class DebugEmitter(Emitter):
""" Emitter that prints raw records to a file, useful for debugging. """Emitter that prints raw records to a file, useful for debugging.
DebugEmitter() by default prints to stdout. DebugEmitter() by default prints to stdout.
DebugEmitter(open('test', 'w')) would print to a file named test DebugEmitter(open('test', 'w')) would print to a file named test
""" """
def __init__(self, outfile=None): def __init__(self, outfile=None):
super(DebugEmitter, self).__init__() super().__init__()
if not outfile: if not outfile:
import sys import sys
self._outfile = sys.stdout self._outfile = sys.stdout
else: else:
self._outfile = outfile self._outfile = outfile
@ -54,20 +58,21 @@ class DebugEmitter(Emitter):
class CountEmitter(Emitter): class CountEmitter(Emitter):
""" Emitter that writes the record count to a file-like object. """Emitter that writes the record count to a file-like object.
CountEmitter() by default writes to stdout. CountEmitter() by default writes to stdout.
CountEmitter(outfile=open('text', 'w')) would print to a file name test. CountEmitter(outfile=open('text', 'w')) would print to a file name test.
CountEmitter(every=1000000) would write the count every 1,000,000 records. CountEmitter(every=1000000) would write the count every 1,000,000 records.
CountEmitter(every=100, of=2000) would write "<count> of 2000" every 100 records. CountEmitter(every=100, of=2000) would write "<count> of 2000" every 100 records.
""" """
def __init__(self, every=1000, of=None, outfile=None, format=None): def __init__(self, every=1000, of=None, outfile=None, format=None):
super(CountEmitter, self).__init__() super().__init__()
if not outfile: if not outfile:
import sys import sys
self._outfile = sys.stdout self._outfile = sys.stdout
else: else:
self._outfile = outfile self._outfile = outfile
@ -84,7 +89,7 @@ class CountEmitter(Emitter):
self.count = 0 self.count = 0
def format(self): def format(self):
return self._format % {'count': self.count, 'of': self._of} return self._format % {"count": self.count, "of": self._of}
def emit_record(self, record): def emit_record(self, record):
self.count += 1 self.count += 1
@ -96,15 +101,16 @@ class CountEmitter(Emitter):
class CSVEmitter(Emitter): class CSVEmitter(Emitter):
""" Emitter that writes records to a CSV file. """Emitter that writes records to a CSV file.
CSVEmitter(open('output.csv','w'), ('id', 'name', 'phone')) writes all CSVEmitter(open('output.csv','w'), ('id', 'name', 'phone')) writes all
records to a csvfile with the columns in the order specified. records to a csvfile with the columns in the order specified.
""" """
def __init__(self, csvfile, fieldnames): def __init__(self, csvfile, fieldnames):
super(CSVEmitter, self).__init__() super().__init__()
import csv import csv
self._dictwriter = csv.DictWriter(csvfile, fieldnames) self._dictwriter = csv.DictWriter(csvfile, fieldnames)
# write header row # write header row
header_row = dict(zip(fieldnames, fieldnames)) header_row = dict(zip(fieldnames, fieldnames))
@ -115,36 +121,43 @@ class CSVEmitter(Emitter):
class SqliteEmitter(Emitter): class SqliteEmitter(Emitter):
""" Emitter that writes records to a SQLite database. """Emitter that writes records to a SQLite database.
SqliteEmitter('addressbook.db', 'friend') writes all records to the SqliteEmitter('addressbook.db', 'friend') writes all records to the
friends table in the SQLite database named addressbook.db friends table in the SQLite database named addressbook.db
(To have the emitter create the table, the fieldnames should be passed (To have the emitter create the table, the fieldnames should be passed
as a third parameter to SqliteEmitter.) as a third parameter to SqliteEmitter.)
""" """
def __init__(self, dbname, table_name, fieldnames=None, replace=False, quiet=False): def __init__(self, dbname, table_name, fieldnames=None, replace=False, quiet=False):
super(SqliteEmitter, self).__init__() super().__init__()
import sqlite3 import sqlite3
self._conn = sqlite3.connect(dbname) self._conn = sqlite3.connect(dbname)
self._cursor = self._conn.cursor() self._cursor = self._conn.cursor()
self._table_name = table_name self._table_name = table_name
self._replace = replace self._replace = replace
self._quiet = quiet self._quiet = quiet
if fieldnames: if fieldnames:
create = "CREATE TABLE IF NOT EXISTS %s (%s)" % (table_name, create = "CREATE TABLE IF NOT EXISTS %s (%s)" % (
', '.join([' '.join((field, 'TEXT')) for field in fieldnames])) table_name,
", ".join([" ".join((field, "TEXT")) for field in fieldnames]),
)
self._cursor.execute(create) self._cursor.execute(create)
def emit_record(self, record): def emit_record(self, record):
import sqlite3 import sqlite3
# input should be escaped with ? if data isn't trusted # input should be escaped with ? if data isn't trusted
qmarks = ','.join(('?',) * len(record)) qmarks = ",".join(("?",) * len(record))
insert = 'INSERT OR REPLACE' if self._replace else 'INSERT' insert = "INSERT OR REPLACE" if self._replace else "INSERT"
insert = '%s INTO %s (%s) VALUES (%s)' % (insert, self._table_name, insert = "%s INTO %s (%s) VALUES (%s)" % (
','.join(record.keys()), insert,
qmarks) self._table_name,
",".join(record.keys()),
qmarks,
)
try: try:
self._cursor.execute(insert, list(record.values())) self._cursor.execute(insert, list(record.values()))
except sqlite3.IntegrityError as ie: except sqlite3.IntegrityError as ie:
@ -158,26 +171,29 @@ class SqliteEmitter(Emitter):
class SqlDumpEmitter(Emitter): class SqlDumpEmitter(Emitter):
""" Emitter that writes SQL INSERT statements. """Emitter that writes SQL INSERT statements.
The output generated by the SqlDumpEmitter is intended to be used to The output generated by the SqlDumpEmitter is intended to be used to
populate a mySQL database. populate a mySQL database.
SqlDumpEmitter(open('addresses.sql', 'w'), 'friend', ('name', 'phone')) SqlDumpEmitter(open('addresses.sql', 'w'), 'friend', ('name', 'phone'))
writes statements to addresses.sql to insert the data writes statements to addresses.sql to insert the data
into the friends table. into the friends table.
""" """
def __init__(self, outfile, table_name, fieldnames): def __init__(self, outfile, table_name, fieldnames):
super(SqlDumpEmitter, self).__init__() super().__init__()
self._fieldnames = fieldnames self._fieldnames = fieldnames
if not outfile: if not outfile:
import sys import sys
self._outfile = sys.stderr self._outfile = sys.stderr
else: else:
self._outfile = outfile self._outfile = outfile
self._insert_str = "INSERT INTO `%s` (`%s`) VALUES (%%s);\n" % ( self._insert_str = "INSERT INTO `%s` (`%s`) VALUES (%%s);\n" % (
table_name, '`,`'.join(fieldnames)) table_name,
"`,`".join(fieldnames),
)
def quote(self, item): def quote(self, item):
@ -190,29 +206,31 @@ class SqlDumpEmitter(Emitter):
types = (str,) types = (str,)
if isinstance(item, types): if isinstance(item, types):
item = item.replace("\\","\\\\").replace("'","\\'").replace(chr(0),'0') item = item.replace("\\", "\\\\").replace("'", "\\'").replace(chr(0), "0")
return "'%s'" % item return "'%s'" % item
return "%s" % item return "%s" % item
def emit_record(self, record): def emit_record(self, record):
quoted_data = [self.quote(record[field]) for field in self._fieldnames] quoted_data = [self.quote(record[field]) for field in self._fieldnames]
self._outfile.write(self._insert_str % ','.join(quoted_data)) self._outfile.write(self._insert_str % ",".join(quoted_data))
class DjangoModelEmitter(Emitter):
    """Emitter that populates a table corresponding to a django model.

    Takes a django settings file, app label and model name and uses django
    to insert the records into the appropriate table.

    DjangoModelEmitter('settings.py', 'addressbook', 'friend') writes
    records to addressbook.models.friend model using database settings
    from settings.py.
    """

    def __init__(self, dj_settings, app_label, model_name):
        super().__init__()
        from saucebrush.utils import get_django_model

        self._dbmodel = get_django_model(dj_settings, app_label, model_name)
        if not self._dbmodel:
            raise Exception("No such model: %s %s" % (app_label, model_name))
@ -222,19 +240,30 @@ class DjangoModelEmitter(Emitter):
class MongoDBEmitter(Emitter):
    """Emitter that creates a document in a MongoDB datastore

    The names of the database and collection in which the records will
    be inserted are required parameters. The host and port are optional,
    defaulting to 'localhost' and 27017, respectively.
    """

    def __init__(
        self,
        database,
        collection,
        host="localhost",
        port=27017,
        drop_collection=False,
        conn=None,
    ):
        super().__init__()
        from pymongo.database import Database

        if not isinstance(database, Database):
            if not conn:
                from pymongo.connection import Connection

                conn = Connection(host, port)
            db = conn[database]
        else:
@ -249,16 +278,17 @@ class MongoDBEmitter(Emitter):
class LoggingEmitter(Emitter):
    """Emitter that logs to a Python logging.Logger instance.

    The msg_template will be passed the record being emitted as
    a format parameter. The resulting message will get logged
    at the provided level.
    """

    import logging

    def __init__(self, logger, msg_template, level=logging.DEBUG):
        super().__init__()
        self.logger = logger
        self.msg_template = msg_template
        self.level = level
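
# Usage sketch (illustrative; logger name and template are examples, and
# the behavior is inferred from the docstring above): each record is
# interpolated into msg_template and logged at the configured level.
#
#   import logging
#   logging.basicConfig()
#   log = logging.getLogger("saucebrush")
#   le = LoggingEmitter(log, "emitted: %s", level=logging.INFO)
#   list(le.attach([{"a": 1}]))  # logs "emitted: {'a': 1}"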

View File

@ -12,26 +12,28 @@ import re
import time

######################
# Abstract Filters #
######################


class Filter:
    """ABC for filters that operate on records.

    All derived filters must provide a process_record(self, record) that
    takes a single record (python dictionary) and returns a result.
    """

    def process_record(self, record):
        """Abstract method to be overridden.

        Called with a single record, should return modified record.
        """
        raise NotImplementedError(
            "process_record not defined in " + self.__class__.__name__
        )

    def reject_record(self, record, exception):
        recipe = getattr(self, "_recipe")
        if recipe:
            recipe.reject_record(record, exception)
@ -47,11 +49,11 @@ class Filter(object):
class YieldFilter(Filter):
    """ABC for defining filters where process_record yields.

    If process_record cannot return exactly one result for every record
    it is passed, it should yield back as many records as needed and the
    filter must derive from YieldFilter.
    """

    def attach(self, source, recipe=None):
@ -65,19 +67,19 @@ class YieldFilter(Filter):
class FieldFilter(Filter):
    """ABC for filters that do a single operation on individual fields.

    All derived filters must provide a process_field(self, item) that
    returns a modified item. process_field is called on one or more keys
    passed into __init__.
    """

    def __init__(self, keys):
        super().__init__()
        self._target_keys = utils.str_or_list(keys)

    def process_record(self, record):
        """Calls process_field on all keys passed to __init__."""

        for key in self._target_keys:
            try:
@ -89,29 +91,31 @@ class FieldFilter(Filter):
        return record

    def process_field(self, item):
        """Given a value, return the value that it should be replaced with."""

        raise NotImplementedError(
            "process_field not defined in " + self.__class__.__name__
        )

    def __unicode__(self):
        return "%s( %s )" % (self.__class__.__name__, str(self._target_keys))


class ConditionalFilter(YieldFilter):
    """ABC for filters that only pass through records meeting a condition.

    All derived filters must provide a test_record(self, record) that
    returns True or False -- True indicating that the record should be
    passed through, and False preventing pass through.

    If validator is True then raises a ValidationError instead of
    silently dropping records that fail test_record.
    """

    validator = False

    def process_record(self, record):
        """Yields all records for which self.test_record is true"""

        if self.test_record(record):
            yield record
@ -119,41 +123,45 @@ class ConditionalFilter(YieldFilter):
            raise ValidationError(record)

    def test_record(self, record):
        """Given a record, return True iff it should be passed on"""

        raise NotImplementedError(
            "test_record not defined in " + self.__class__.__name__
        )


class ValidationError(Exception):
    def __init__(self, record):
        super().__init__(repr(record))
        self.record = record


def _dotted_get(d, path):
    """
    utility function for SubrecordFilter

    dives into a complex nested dictionary with paths like a.b.c
    """
    if path:
        key_pieces = path.split(".", 1)
        piece = d[key_pieces[0]]
        if isinstance(piece, (tuple, list)):
            return [_dotted_get(i, ".".join(key_pieces[1:])) for i in piece]
        elif isinstance(piece, (dict)):
            return _dotted_get(piece, ".".join(key_pieces[1:]))
    else:
        return d
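
# Behavior sketch (illustrative): a dotted path walks nested dicts and fans
# out over any list met along the way, returning the subrecord parent(s).
#
#   record = {"a": {"b": [{"c": 1}, {"c": 2}]}}
#   _dotted_get(record, "a.b")  # -> [{"c": 1}, {"c": 2}]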
class SubrecordFilter(Filter):
    """Filter that calls another filter on subrecord(s) of a record

    Takes a dotted path (eg. a.b.c) and instantiated filter and runs that
    filter on all subrecords found at the path.
    """

    def __init__(self, field_path, filter_):
        if "." in field_path:
            self.field_path, self.key = field_path.rsplit(".", 1)
        else:
            self.field_path = None
            self.key = field_path
@ -178,8 +186,9 @@ class SubrecordFilter(Filter):
            self.process_subrecord(subrecord_parent)
        return record


class ConditionalPathFilter(Filter):
    """Filter that uses a predicate to split input among two filter paths."""

    def __init__(self, predicate_func, true_filter, false_filter):
        self.predicate_func = predicate_func
@ -192,38 +201,43 @@ class ConditionalPathFilter(Filter):
        else:
            return self.false_filter.process_record(record)


#####################
# Generic Filters #
#####################


class FieldModifier(FieldFilter):
    """Filter that calls a given function on a given set of fields.

    FieldModifier(('spam','eggs'), abs) to call the abs method on the spam
    and eggs fields in each record filtered.
    """

    def __init__(self, keys, func):
        super().__init__(keys)
        self._filter_func = func

    def process_field(self, item):
        return self._filter_func(item)

    def __str__(self):
        return "%s( %s, %s )" % (
            self.__class__.__name__,
            str(self._target_keys),
            str(self._filter_func),
        )
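
# Usage sketch (illustrative): FieldModifier applies a callable to the
# named fields of every record passing through.
#
#   fm = FieldModifier(("a", "b"), abs)
#   list(fm.attach([{"a": -1, "b": -2, "c": 3}]))
#   # -> [{"a": 1, "b": 2, "c": 3}]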
class FieldKeeper(Filter):
    """Filter that removes all but the given set of fields.

    FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs
    fields from every record filtered.
    """

    def __init__(self, keys):
        super().__init__()
        self._target_keys = utils.str_or_list(keys)

    def process_record(self, record):
@ -234,14 +248,14 @@ class FieldKeeper(Filter):
class FieldRemover(Filter):
    """Filter that removes a given set of fields.

    FieldRemover(('spam', 'eggs')) removes the spam and eggs fields from
    every record filtered.
    """

    def __init__(self, keys):
        super().__init__()
        self._target_keys = utils.str_or_list(keys)

    def process_record(self, record):
@ -249,21 +263,21 @@ class FieldRemover(Filter):
            record.pop(key, None)
        return record

    def __str__(self):
        return "%s( %s )" % (self.__class__.__name__, str(self._target_keys))


class FieldMerger(Filter):
    """Filter that merges a given set of fields using a supplied merge_func.

    Takes a mapping (dictionary of new_column:(from_col1,from_col2))

    FieldMerger({"bacon": ("spam", "eggs")}, operator.add) creates a new
    column bacon that is the result of spam+eggs
    """

    def __init__(self, mapping, merge_func, keep_fields=False):
        super().__init__()
        self._field_mapping = mapping
        self._merge_func = merge_func
        self._keep_fields = keep_fields
@ -277,30 +291,32 @@ class FieldMerger(Filter):
            record[to_col] = self._merge_func(*values)
        return record

    def __str__(self):
        return "%s( %s, %s )" % (
            self.__class__.__name__,
            str(self._field_mapping),
            str(self._merge_func),
        )


class FieldAdder(Filter):
    """Filter that adds a new field.

    Takes a name for the new field and a value, field_value can be an
    iterable, a function, or a static value.

    from itertools import count
    FieldAdder('id', count)

    would yield a new column named id that uses the itertools count iterable
    to create sequentially numbered ids.
    """

    def __init__(self, field_name, field_value, replace=True):
        super().__init__()
        self._field_name = field_name
        self._field_value = field_value
        if hasattr(self._field_value, "__iter__"):
            value_iter = iter(self._field_value)
            # Python 3 iterators expose __next__, not the py2-era next
            if hasattr(value_iter, "__next__"):
                self._field_value = value_iter.__next__
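
# Usage sketches (illustrative) for the two filters above; by default
# FieldMerger consumes its source fields, and FieldAdder (given the
# __next__ handling above) draws one value per record from an iterator.
#
#   import operator
#   fm = FieldMerger({"bacon": ("spam", "eggs")}, operator.add)
#   list(fm.attach([{"spam": 1, "eggs": 2}]))  # -> [{"bacon": 3}]
#
#   from itertools import count
#   fa = FieldAdder("id", count())
#   list(fa.attach([{"a": 1}, {"a": 2}]))
#   # -> [{"a": 1, "id": 0}, {"a": 2, "id": 1}]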
@ -317,17 +333,22 @@ class FieldAdder(Filter):
        return record

    def __unicode__(self):
        return "%s( %s, %s )" % (
            self.__class__.__name__,
            self._field_name,
            str(self._field_value),
        )


class FieldCopier(Filter):
    """Filter that copies one field to another.

    Takes a dictionary mapping destination keys to source keys.
    """

    def __init__(self, copy_mapping):
        super().__init__()
        self._copy_mapping = copy_mapping

    def process_record(self, record):
@ -336,13 +357,15 @@ class FieldCopier(Filter):
            record[dest] = record[source]
        return record


class FieldRenamer(Filter):
    """Filter that renames one field to another.

    Takes a dictionary mapping destination keys to source keys.
    """

    def __init__(self, rename_mapping):
        super().__init__()
        self._rename_mapping = rename_mapping

    def process_record(self, record):
@ -351,15 +374,16 @@ class FieldRenamer(Filter):
            record[dest] = record.pop(source)
        return record


class FieldNameModifier(Filter):
    """Filter that calls a given function on a given set of fields.

    FieldNameModifier(('spam','eggs'), abs) to call the abs method on the spam
    and eggs field names in each record filtered.
    """

    def __init__(self, func):
        super().__init__()
        self._filter_func = func

    def process_record(self, record):
@ -368,19 +392,20 @@ class FieldNameModifier(Filter):
            record[dest] = record.pop(source)
        return record


class Splitter(Filter):
    """Filter that splits nested data into different paths.

    Takes a dictionary of keys and a series of filters to run against the
    associated dictionaries.

    {'person': {'firstname': 'James', 'lastname': 'Turk'},
     'phones': [{'phone': '222-222-2222'}, {'phone': '335-333-3321'}]
    }
    """

    def __init__(self, split_mapping):
        super().__init__()
        self._split_mapping = split_mapping

    def process_record(self, record):
@ -409,21 +434,22 @@ class Splitter(Filter):
class Flattener(FieldFilter):
    """Collapse a set of similar dictionaries into a list.

    Takes a dictionary of keys and flattens the key names:

    addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
                                {'address': {'state':'NY', 'street':'3000 Winton Rd'}}]}]
    flattener = Flattener(['addresses'])

    would yield:

    {'addresses': [{'state': 'NC', 'street': '146 shirley drive'},
                   {'state': 'NY', 'street': '3000 Winton Rd'}]}
    """

    def __init__(self, keys):
        super().__init__(keys)

    def process_field(self, item):
        result = []
@ -436,8 +462,8 @@ class Flattener(FieldFilter):
class DictFlattener(Filter):
    def __init__(self, keys, separator="_"):
        super().__init__()
        self._keys = utils.str_or_list(keys)
        self._separator = separator
@ -446,11 +472,10 @@ class DictFlattener(Filter):
class Unique(ConditionalFilter):
    """Filter that ensures that all records passing through are unique."""

    def __init__(self):
        super().__init__()
        self._seen = set()

    def test_record(self, record):
@ -461,19 +486,20 @@ class Unique(ConditionalFilter):
        else:
            return False


class UniqueValidator(Unique):
    validator = True


class UniqueID(ConditionalFilter):
    """Filter that ensures that all records through have a unique ID.

    Takes the name of an ID field, or multiple field names in the case
    of a composite ID.
    """

    def __init__(self, field="id", *args):
        super().__init__()
        self._seen = set()
        self._id_fields = [field]
        self._id_fields.extend(args)
@ -486,58 +512,30 @@ class UniqueID(ConditionalFilter):
        else:
            return False


class UniqueIDValidator(UniqueID):
    validator = True
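
# Usage sketch (illustrative): Unique drops exact duplicate records, while
# UniqueID keys uniqueness off one or more ID fields.
#
#   u = UniqueID("id")
#   list(u.attach([{"id": 1}, {"id": 1}, {"id": 2}]))
#   # -> [{"id": 1}, {"id": 2}]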
# -- removed by this commit: Python 2 unicode/str conversion filters --
#
# class UnicodeFilter(Filter):
#     """ Convert all str elements in the record to Unicode. """
#
#     def __init__(self, encoding='utf-8', errors='ignore'):
#         super(UnicodeFilter, self).__init__()
#         self._encoding = encoding
#         self._errors = errors
#
#     def process_record(self, record):
#         for key, value in record.items():
#             if isinstance(value, str):
#                 record[key] = unicode(value, self._encoding, self._errors)
#             elif isinstance(value, unicode):
#                 record[key] = value.decode(self._encoding, self._errors)
#         return record
#
#
# class StringFilter(Filter):
#     def __init__(self, encoding='utf-8', errors='ignore'):
#         super(StringFilter, self).__init__()
#         self._encoding = encoding
#         self._errors = errors
#
#     def process_record(self, record):
#         for key, value in record.items():
#             if isinstance(value, unicode):
#                 record[key] = value.encode(self._encoding, self._errors)
#         return record


###########################
# Commonly Used Filters #
###########################


class PhoneNumberCleaner(FieldFilter):
    """Filter that cleans phone numbers to match a given format.

    Takes a list of target keys and an optional phone # format that has
    10 %s placeholders.

    PhoneNumberCleaner( ('phone','fax'), number_format='%s%s%s-%s%s%s-%s%s%s%s')
    would format the phone & fax columns to 555-123-4567 format.
    """

    def __init__(self, keys, number_format="%s%s%s.%s%s%s.%s%s%s%s"):
        super().__init__(keys)
        self._number_format = number_format
        self._num_re = re.compile(r"\d")

    def process_field(self, item):
        nums = self._num_re.findall(item)

@ -545,46 +543,54 @@ class PhoneNumberCleaner(FieldFilter):

            item = self._number_format % tuple(nums)
        return item
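
# Usage sketch (illustrative): the digits are pulled out of the field and
# re-packed into number_format.
#
#   pnc = PhoneNumberCleaner(("phone",), number_format="%s%s%s-%s%s%s-%s%s%s%s")
#   list(pnc.attach([{"phone": "(555) 123-4567"}]))
#   # -> [{"phone": "555-123-4567"}]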
class DateCleaner(FieldFilter):
    """Filter that cleans dates to match a given format.

    Takes a list of target keys and to and from formats in strftime format.
    """

    def __init__(self, keys, from_format, to_format):
        super().__init__(keys)
        self._from_format = from_format
        self._to_format = to_format

    def process_field(self, item):
        return time.strftime(self._to_format, time.strptime(item, self._from_format))
class NameCleaner(Filter):
    """Filter that splits names into a first, last, and middle name field.

    Takes a list of target keys.

    NameCleaner( ('name', ), nomatch_name='raw_name')
    would attempt to split 'name' into firstname, middlename, lastname,
    and suffix columns, and if it did not fit would place it in raw_name
    """

    # first middle? last suffix?
    FIRST_LAST = re.compile(
        r"""^\s*(?:(?P<firstname>\w+)(?:\.?)
        \s+(?:(?P<middlename>\w+)\.?\s+)?
        (?P<lastname>[A-Za-z'-]+))
        (?:\s+(?P<suffix>JR\.?|II|III|IV))?
        \s*$""",
        re.VERBOSE | re.IGNORECASE,
    )

    # last, first middle? suffix?
    LAST_FIRST = re.compile(
        r"""^\s*(?:(?P<lastname>[A-Za-z'-]+),
        \s+(?P<firstname>\w+)(?:\.?)
        (?:\s+(?P<middlename>\w+)\.?)?)
        (?:\s+(?P<suffix>JR\.?|II|III|IV))?
        \s*$""",
        re.VERBOSE | re.IGNORECASE,
    )

    def __init__(self, keys, prefix="", formats=None, nomatch_name=None):
        super().__init__()
        self._keys = utils.str_or_list(keys)
        self._name_prefix = prefix
        self._nomatch_name = nomatch_name
@ -605,7 +611,7 @@ class NameCleaner(Filter):
            # if there is a match, remove original name and add pieces
            if match:
                record.pop(key)
                for k, v in match.groupdict().items():
                    record[self._name_prefix + k] = v
                break
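
# Usage sketch (illustrative): names matching one of the regexes above are
# split into their parts; anything else falls through to nomatch_name.
#
#   nc = NameCleaner(("name",), nomatch_name="raw_name")
#   list(nc.attach([{"name": "Turk, James"}]))
#   # -> [{"lastname": "Turk", "firstname": "James",
#   #      "middlename": None, "suffix": None}]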

View File

@ -4,27 +4,28 @@
All sources must implement the iterable interface and return python
dictionaries.
"""
import string

from saucebrush import utils


class CSVSource:
    """Saucebrush source for reading from CSV files.

    Takes an open csvfile, an optional set of fieldnames and optional number
    of rows to skip.

    CSVSource(open('test.csv')) will read a csvfile, using the first row as
    the field names.

    CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read
    in a CSV file and treat the three columns as name, phone, and address,
    ignoring the first row (presumed to be column names).
    """

    def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):
        import csv

        self._dictreader = csv.DictReader(csvfile, fieldnames, **kwargs)
        for _ in range(skiprows):
            next(self._dictreader)
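
# Usage sketch (illustrative; assumes test.csv has a header row that
# includes a "name" column):
#
#   with open("test.csv") as f:
#       for record in CSVSource(f):
#           print(record["name"])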
@ -33,17 +34,17 @@ class CSVSource(object):
        return self._dictreader


class FixedWidthFileSource:
    """Saucebrush source for reading from fixed width field files.

    FixedWidthFileSource expects an open fixed width file and a tuple
    of fields with their lengths. There is also an optional fillchars
    command that is the filler characters to strip from the end of each
    field. (defaults to whitespace)

    FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
    will read in a fixed width file where the first 30 characters of each
    line are part of a name and the characters 31-42 are a phone number.
    """

    def __init__(self, fwfile, fields, fillchars=string.whitespace):
@ -64,97 +65,98 @@ class FixedWidthFileSource(object):
        line = next(self._fwfile)
        record = {}
        for name, range_ in self._fields_dict.items():
            record[name] = line[range_[0] : range_[1]].rstrip(self._fillchars)
        return record

    # -- removed by this commit: Python 2 iterator shim --
    #
    # def next(self):
    #     """ Keep Python 2 next() method that defers to __next__(). """
    #     return self.__next__()


class HtmlTableSource:
    """Saucebrush source for reading data from an HTML table.

    HtmlTableSource expects an open html file, the id of the table or a
    number indicating which table on the page to use, an optional fieldnames
    tuple, and an optional number of rows to skip.

    HtmlTableSource(open('test.html'), 0) opens the first HTML table and
    uses the first row as the names of the columns.

    HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens
    the HTML table with an id of 'people' and names the two columns
    name and phone, skipping the first row where alternate names are
    stored.
    """

    def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
        # extract the table
        from lxml.html import parse

        doc = parse(htmlfile).getroot()
        if isinstance(id_or_num, int):
            table = doc.cssselect("table")[id_or_num]
        else:
            table = doc.cssselect("table#%s" % id_or_num)
            table = table[0]  # get the first table

        # skip the necessary number of rows
        self._rows = table.cssselect("tr")[skiprows:]

        # determine the fieldnames
        if not fieldnames:
            self._fieldnames = [
                td.text_content() for td in self._rows[0].cssselect("td, th")
            ]
            skiprows += 1
        else:
            self._fieldnames = fieldnames

        # skip the necessary number of rows
        self._rows = table.cssselect("tr")[skiprows:]

    def process_tr(self):
        for row in self._rows:
            strings = [td.text_content() for td in row.cssselect("td")]
            yield dict(zip(self._fieldnames, strings))

    def __iter__(self):
        return self.process_tr()
class DjangoModelSource:
    """Saucebrush source for reading data from django models.

    DjangoModelSource expects a django settings file, app label, and model
    name. The resulting records contain all columns in the table for the
    specified model.

    DjangoModelSource('settings.py', 'phonebook', 'friend') would read all
    friends from the friend model in the phonebook app described in
    settings.py.
    """

    def __init__(self, dj_settings, app_label, model_name):
        dbmodel = utils.get_django_model(dj_settings, app_label, model_name)
        # only get values defined in model (no extra fields from custom manager)
        self._data = dbmodel.objects.values(*[f.name for f in dbmodel._meta.fields])

    def __iter__(self):
        return iter(self._data)
class MongoDBSource:
    """Source for reading from a MongoDB database.

    The record dict is populated with records matching the spec
    from the specified database and collection.
    """

    def __init__(
        self, database, collection, spec=None, host="localhost", port=27017, conn=None
    ):
        if not conn:
            from pymongo.connection import Connection

            conn = Connection(host, port)
        self.collection = conn[database][collection]
        self.spec = spec
@ -166,19 +168,21 @@ class MongoDBSource(object):
        for doc in self.collection.find(self.spec):
            yield dict(doc)


# dict_factory for sqlite source
def dict_factory(cursor, row):
    d = {}
    for idx, col in enumerate(cursor.description):
        d[col[0]] = row[idx]
    return d
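
# Behavior sketch (illustrative): dict_factory is meant to be installed as
# sqlite3's row_factory so every fetched row comes back as a plain dict.
#
#   import sqlite3
#   conn = sqlite3.connect(":memory:")
#   conn.row_factory = dict_factory
#   conn.execute("CREATE TABLE t (a, b)")
#   conn.execute("INSERT INTO t VALUES (1, 2)")
#   conn.execute("SELECT a, b FROM t").fetchone()  # -> {"a": 1, "b": 2}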
class SqliteSource:
    """Source that reads from a sqlite database.

    The record dict is populated with the results from the
    query argument. If given, args will be passed to the query
    when executed.
    """

    def __init__(self, dbpath, query, args=None, conn_params=None):
@ -213,11 +217,11 @@ class SqliteSource(object):
        self._conn.close()


class FileSource:
    """Base class for sources which read from one or more files.

    Takes as input a file-like, a file path, a list of file-likes,
    or a list of file paths.
    """

    def __init__(self, input):
@ -226,34 +230,36 @@ class FileSource(object):
    def __iter__(self):
        # This method would be a lot cleaner with the proposed
        # 'yield from' expression (PEP 380)
        if hasattr(self._input, "__read__") or hasattr(self._input, "read"):
            for record in self._process_file(self._input):
                yield record
        elif isinstance(self._input, str):
            with open(self._input) as f:
                for record in self._process_file(f):
                    yield record
        elif hasattr(self._input, "__iter__"):
            for el in self._input:
                if isinstance(el, str):
                    with open(el) as f:
                        for record in self._process_file(f):
                            yield record
                elif hasattr(el, "__read__") or hasattr(el, "read"):
                    # process the file-like element itself (the original
                    # referenced an unbound `f` here)
                    for record in self._process_file(el):
                        yield record

    def _process_file(self, file):
        raise NotImplementedError(
            "Descendants of FileSource should implement"
            " a custom _process_file method."
        )
class JSONSource(FileSource):
    """Source for reading from JSON files.

    When processing JSON files, if the top-level object is a list, will
    yield each member separately. Otherwise, yields the top-level
    object.
    """

    def _process_file(self, f):
@ -271,36 +277,37 @@ class JSONSource(FileSource):
        else:
            yield obj
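
# Usage sketch (illustrative): given data.json containing [{"a": 1}, {"a": 2}],
# each list element is yielded as its own record.
#
#   for record in JSONSource("data.json"):
#       print(record)  # {'a': 1} then {'a': 2}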
class XMLSource(FileSource):
    """Source for reading from XML files. Use with the same kind of caution
    that you use to approach anything written in XML.

    When processing XML files, if the top-level object is a list, will
    yield each member separately, unless the dotted path to a list is
    included. you can also do this with a SubrecordFilter, but XML is
    almost never going to be useful at the top level.
    """

    def __init__(self, input, node_path=None, attr_prefix="ATTR_", postprocessor=None):
        super().__init__(input)
        # guard against the default node_path of None
        self.node_list = node_path.split(".") if node_path else None
        self.attr_prefix = attr_prefix
        self.postprocessor = postprocessor

    def _process_file(self, f, attr_prefix="ATTR_"):
        """xmltodict can either return attributes of nodes as prefixed fields
        (prefixes to avoid key collisions), or ignore them altogether.

        set attr prefix to whatever you want. Setting it to False ignores
        attributes.
        """
        import xmltodict

        if self.postprocessor:
            obj = xmltodict.parse(
                f, attr_prefix=self.attr_prefix, postprocessor=self.postprocessor
            )
        else:
            obj = xmltodict.parse(f, attr_prefix=self.attr_prefix)
@ -308,7 +315,7 @@ class XMLSource(FileSource):
        if self.node_list:
            for node in self.node_list:
                obj = obj[node]

        # If the top-level XML object in the file is a list
        # then yield each element separately; otherwise, yield
View File

@ -1,22 +1,22 @@
from saucebrush.filters import Filter

import collections
import math


def _average(values):
    """Calculate the average of a list of values.

    :param values: an iterable of ints or floats to average
    """
    value_count = len(values)
    if len(values) > 0:
        return sum(values) / float(value_count)


def _median(values):
    """Calculate the median of a list of values.

    :param values: an iterable of ints or floats to calculate
    """
    count = len(values)
@ -35,14 +35,15 @@ def _median(values):
    else:
        # even number of items, return average of middle two items
        mid = int(count / 2)
        return sum(values[mid - 1 : mid + 1]) / 2.0


def _stddev(values, population=False):
    """Calculate the standard deviation and variance of a list of values.

    :param values: an iterable of ints or floats to calculate
    :param population: True if values represents entire population,
        False if it is a sample of the population
    """
    avg = _average(values)
@ -54,11 +55,11 @@ def _stddev(values, population=False):
    # the average of the squared differences
    variance = sum(diffsq) / float(count)
    return (math.sqrt(variance), variance)  # stddev is sqrt of variance


class StatsFilter(Filter):
    """Base for all stats filters."""

    def __init__(self, field, test=None):
        self._field = field
@ -70,20 +71,21 @@ class StatsFilter(Filter):
        return record

    def process_field(self, record):
        raise NotImplementedError(
            "process_field not defined in " + self.__class__.__name__
        )

    def value(self):
        raise NotImplementedError("value not defined in " + self.__class__.__name__)


class Sum(StatsFilter):
    """Calculate the sum of the values in a field. Field must contain either
    int or float values.
    """

    def __init__(self, field, initial=0, **kwargs):
        super().__init__(field, **kwargs)
        self._value = initial

    def process_field(self, item):
@ -92,13 +94,14 @@ class Sum(StatsFilter):
    def value(self):
        return self._value


class Average(StatsFilter):
    """Calculate the average (mean) of the values in a field. Field must
    contain either int or float values.
    """

    def __init__(self, field, initial=0, **kwargs):
        super().__init__(field, **kwargs)
        self._value = initial
        self._count = 0
@ -110,15 +113,16 @@ class Average(StatsFilter):
    def value(self):
        return self._value / float(self._count)


class Median(StatsFilter):
    """Calculate the median of the values in a field. Field must contain
    either int or float values.

    **This filter keeps a list of field values in memory.**
    """

    def __init__(self, field, **kwargs):
        super().__init__(field, **kwargs)
        self._values = []

    def process_field(self, item):
@ -128,13 +132,14 @@ class Median(StatsFilter):
    def value(self):
        return _median(self._values)


class MinMax(StatsFilter):
    """Find the minimum and maximum values in a field. Field must contain
    either int or float values.
    """

    def __init__(self, field, **kwargs):
        super().__init__(field, **kwargs)
        self._max = None
        self._min = None
@ -148,18 +153,19 @@ class MinMax(StatsFilter):
    def value(self):
        return (self._min, self._max)


class StandardDeviation(StatsFilter):
    """Calculate the standard deviation of the values in a field. Calling
    value() will return a standard deviation for the sample. Pass
    population=True to value() for the standard deviation of the
    population. Convenience methods are provided for average() and
    median(). Field must contain either int or float values.

    **This filter keeps a list of field values in memory.**
    """

    def __init__(self, field, **kwargs):
        super().__init__(field, **kwargs)
        self._values = []

    def process_field(self, item):
@ -173,31 +179,29 @@ class StandardDeviation(StatsFilter):
        return _median(self._values)

    def value(self, population=False):
        """Return a tuple of (standard_deviation, variance).

        :param population: True if values represents entire population,
            False if values is a sample. Default: False
        """
        return _stddev(self._values, population)


class Histogram(StatsFilter):
    """Generate a basic histogram of the specified field. The value() method
    returns a dict of value to occurrence count mappings. The __str__ method
    generates a basic and limited histogram useful for printing to the
    command line. The label_length attribute determines the padding and
    cut-off of the basic histogram labels.

    **This filter maintains a dict of unique field values in memory.**
    """

    label_length = 6

    def __init__(self, field, **kwargs):
        super().__init__(field, **kwargs)
        self._counter = collections.Counter()

    def process_field(self, item):
        self._counter[self.prep_field(item)] += 1
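
# Usage sketch (illustrative): stats filters pass records through unchanged
# while accumulating; read the aggregate off value() afterwards.
#
#   s = Sum("amount")
#   for record in s.attach([{"amount": 2}, {"amount": 3}]):
#       pass
#   s.value()  # -> 5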

View File

@ -1,45 +1,46 @@
import os
from urllib.request import urlopen

"""
General utilities used within saucebrush that may be useful elsewhere.
"""


def get_django_model(dj_settings, app_label, model_name):
    """
    Get a django model given a settings file, app label, and model name.
    """
    from django.conf import settings

    if not settings.configured:
        settings.configure(
            DATABASE_ENGINE=dj_settings.DATABASE_ENGINE,
            DATABASE_NAME=dj_settings.DATABASE_NAME,
            DATABASE_USER=dj_settings.DATABASE_USER,
            DATABASE_PASSWORD=dj_settings.DATABASE_PASSWORD,
            DATABASE_HOST=dj_settings.DATABASE_HOST,
            INSTALLED_APPS=dj_settings.INSTALLED_APPS,
        )
    from django.db.models import get_model

    return get_model(app_label, model_name)
def flatten(item, prefix="", separator="_", keys=None):
    """
    Flatten nested dictionary into one with its keys concatenated together.

    >>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
                 'f':{'g':{'h':6}}})
    {'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
    """
    # update dictionaries recursively
    if isinstance(item, dict):
        # don't prepend a leading _
        if prefix != "":
            prefix += separator
        retval = {}
        for key, value in item.items():
@ -48,45 +49,30 @@ def flatten(item, prefix='', separator='_', keys=None):
        else:
            retval[prefix + key] = value
        return retval
    # elif isinstance(item, (tuple, list)):
    #     return {prefix: [flatten(i, prefix, separator, keys) for i in item]}
    else:
        return {prefix: item}


def str_or_list(obj):
    if isinstance(obj, str):
        return [obj]
    else:
        return obj


#
# utility classes
#

# -- removed by this commit: Python 2.6 Counter fallback --
#
# class FallbackCounter(collections.defaultdict):
#     """ Python 2.6 does not have collections.Counter.
#     This is class that does the basics of what we need from Counter.
#     """
#
#     def __init__(self, *args, **kwargs):
#         super(FallbackCounter, self).__init__(int)
#
#     def most_common(n=None):
#         l = sorted(self.items(), cmp=lambda x, y: cmp(x[1], y[1]))
#         if n is not None:
#             l = l[:n]
#         return l


class Files:
    """Iterate over multiple files as a single file. Pass the paths of the
    files as arguments to the class constructor:

    for line in Files('/path/to/file/a', '/path/to/file/b'):
        pass
    """

    def __init__(self, *args):
@ -111,10 +97,11 @@ class Files(object):
            yield line
        f.close()


class RemoteFile:
    """Stream data from a remote file.

    :param url: URL to remote file
    """

    def __init__(self, url):
@ -126,21 +113,24 @@ class RemoteFile(object):
            yield line.rstrip()
        resp.close()
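
# Usage sketch (illustrative URL): RemoteFile streams a remote resource
# line by line, stripping trailing whitespace from each line.
#
#   for line in RemoteFile("https://example.com/data.txt"):
#       print(line)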
class ZippedFiles:
    """unpack a zipped collection of files on init.

    Takes a string with file location or zipfile.ZipFile object

    Best to wrap this in a Files() object, if the goal is to have a
    linereader, as this only returns filelike objects.

    if using a ZipFile object, make sure to set mode to 'a' or 'w' in order
    to use the add() function.
    """

    def __init__(self, zippedfile):
        import zipfile

        if type(zippedfile) == str:
            self._zipfile = zipfile.ZipFile(zippedfile, "a")
        else:
            self._zipfile = zippedfile
        self.paths = self._zipfile.namelist()
@ -152,10 +142,10 @@ class ZippedFiles(object):
    def add(self, path, dirname=None, arcname=None):
        path_base = os.path.basename(path)
        if dirname:
            arcname = os.path.join(dirname, path_base)
        if not arcname:
            arcname = path_base
        self._zipfile.write(path, arcname)
        self.paths.append(path)

    def filereader(self):

107
tests/test_emitters.py Normal file
View File

@ -0,0 +1,107 @@
from contextlib import closing
from io import StringIO
import os

from saucebrush.emitters import (
    DebugEmitter,
    CSVEmitter,
    CountEmitter,
    SqliteEmitter,
    SqlDumpEmitter,
)


def test_debug_emitter():
    with closing(StringIO()) as output:
        de = DebugEmitter(output)
        list(de.attach([1, 2, 3]))
        assert output.getvalue() == "1\n2\n3\n"
def test_count_emitter():
# values for test
values = [
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
]
with closing(StringIO()) as output:
# test without the "of" parameter
ce = CountEmitter(every=10, outfile=output, format="%(count)s records\n")
list(ce.attach(values))
assert output.getvalue() == "10 records\n20 records\n"
ce.done()
assert output.getvalue() == "10 records\n20 records\n22 records\n"
with closing(StringIO()) as output:
# test with the "of" parameter
ce = CountEmitter(every=10, outfile=output, of=len(values))
list(ce.attach(values))
assert output.getvalue() == "10 of 22\n20 of 22\n"
ce.done()
assert output.getvalue() == "10 of 22\n20 of 22\n22 of 22\n"
def test_csv_emitter():
io = StringIO()
with closing(io) as output:
ce = CSVEmitter(output, ("x", "y", "z"))
list(ce.attach([{"x": 1, "y": 2, "z": 3}, {"x": 5, "y": 5, "z": 5}]))
assert output.getvalue() == "x,y,z\r\n1,2,3\r\n5,5,5\r\n"
def test_sqlite_emitter():
import sqlite3
import tempfile
with closing(tempfile.NamedTemporaryFile(suffix=".db")) as f:
db_path = f.name
sle = SqliteEmitter(db_path, "testtable", fieldnames=("a", "b", "c"))
list(sle.attach([{"a": "1", "b": "2", "c": "3"}]))
sle.done()
with closing(sqlite3.connect(db_path)) as conn:
cur = conn.cursor()
cur.execute("""SELECT a, b, c FROM testtable""")
results = cur.fetchall()
os.unlink(db_path)
assert results == [("1", "2", "3")]
def test_sql_dump_emitter():
with closing(StringIO()) as bffr:
sde = SqlDumpEmitter(bffr, "testtable", ("a", "b"))
list(sde.attach([{"a": 1, "b": "2"}]))
sde.done()
assert bffr.getvalue() == "INSERT INTO `testtable` (`a`,`b`) VALUES (1,'2');\n"
tests/test_filters.py Normal file
@@ -0,0 +1,355 @@
import unittest
import types
from saucebrush.filters import (
Filter,
YieldFilter,
FieldFilter,
SubrecordFilter,
ConditionalPathFilter,
ConditionalFilter,
FieldModifier,
FieldKeeper,
FieldRemover,
FieldMerger,
FieldAdder,
FieldCopier,
FieldRenamer,
Unique,
)
class DummyRecipe:
rejected_record = None
rejected_msg = None
def reject_record(self, record, msg):
self.rejected_record = record
self.rejected_msg = msg
class Doubler(Filter):
def process_record(self, record):
return record * 2
class OddRemover(Filter):
def process_record(self, record):
if record % 2 == 0:
return record
else:
return None # explicitly return None
class ListFlattener(YieldFilter):
def process_record(self, record):
for item in record:
yield item
class FieldDoubler(FieldFilter):
def process_field(self, item):
return item * 2
class NonModifyingFieldDoubler(Filter):
def __init__(self, key):
self.key = key
def process_record(self, record):
record = dict(record)
record[self.key] *= 2
return record
class ConditionalOddRemover(ConditionalFilter):
def test_record(self, record):
# return True for even values
return record % 2 == 0
class FilterTestCase(unittest.TestCase):
def _simple_data(self):
return [
{"a": 1, "b": 2, "c": 3},
{"a": 5, "b": 5, "c": 5},
{"a": 1, "b": 10, "c": 100},
]
def assert_filter_result(self, filter_obj, expected_data):
result = filter_obj.attach(self._simple_data())
self.assertEqual(list(result), expected_data)
def test_reject_record(self):
recipe = DummyRecipe()
f = Doubler()
result = f.attach([1, 2, 3], recipe=recipe)
# next has to be called for attach to take effect
next(result)
f.reject_record("bad", "this one was bad")
# ensure that the rejection propagated to the recipe
self.assertEqual("bad", recipe.rejected_record)
self.assertEqual("this one was bad", recipe.rejected_msg)
def test_simple_filter(self):
df = Doubler()
result = df.attach([1, 2, 3])
# ensure we got a generator that yields 2,4,6
self.assertEqual(type(result), types.GeneratorType)
self.assertEqual(list(result), [2, 4, 6])
def test_simple_filter_return_none(self):
cf = OddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEqual(list(result), [0, 2, 4, 6, 8])
def test_simple_yield_filter(self):
lf = ListFlattener()
result = lf.attach([[1], [2, 3], [4, 5, 6]])
# ensure we got a generator that yields 1,2,3,4,5,6
self.assertEqual(type(result), types.GeneratorType)
self.assertEqual(list(result), [1, 2, 3, 4, 5, 6])
def test_simple_field_filter(self):
ff = FieldDoubler(["a", "c"])
# check against expected data
expected_data = [
{"a": 2, "b": 2, "c": 6},
{"a": 10, "b": 5, "c": 10},
{"a": 2, "b": 10, "c": 200},
]
self.assert_filter_result(ff, expected_data)
def test_conditional_filter(self):
cf = ConditionalOddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEqual(list(result), [0, 2, 4, 6, 8])
# Tests for Subrecord
def test_subrecord_filter_list(self):
data = [
{"a": [{"b": 2}, {"b": 4}]},
{"a": [{"b": 5}]},
{"a": [{"b": 8}, {"b": 2}, {"b": 1}]},
]
expected = [
{"a": [{"b": 4}, {"b": 8}]},
{"a": [{"b": 10}]},
{"a": [{"b": 16}, {"b": 4}, {"b": 2}]},
]
sf = SubrecordFilter("a", NonModifyingFieldDoubler("b"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_deep(self):
data = [
{"a": {"d": [{"b": 2}, {"b": 4}]}},
{"a": {"d": [{"b": 5}]}},
{"a": {"d": [{"b": 8}, {"b": 2}, {"b": 1}]}},
]
expected = [
{"a": {"d": [{"b": 4}, {"b": 8}]}},
{"a": {"d": [{"b": 10}]}},
{"a": {"d": [{"b": 16}, {"b": 4}, {"b": 2}]}},
]
sf = SubrecordFilter("a.d", NonModifyingFieldDoubler("b"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_nonlist(self):
data = [
{"a": {"b": {"c": 1}}},
{"a": {"b": {"c": 2}}},
{"a": {"b": {"c": 3}}},
]
expected = [
{"a": {"b": {"c": 2}}},
{"a": {"b": {"c": 4}}},
{"a": {"b": {"c": 6}}},
]
sf = SubrecordFilter("a.b", NonModifyingFieldDoubler("c"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_list_in_path(self):
data = [
{"a": [{"b": {"c": 5}}, {"b": {"c": 6}}]},
{"a": [{"b": {"c": 1}}, {"b": {"c": 2}}, {"b": {"c": 3}}]},
{"a": [{"b": {"c": 2}}]},
]
expected = [
{"a": [{"b": {"c": 10}}, {"b": {"c": 12}}]},
{"a": [{"b": {"c": 2}}, {"b": {"c": 4}}, {"b": {"c": 6}}]},
{"a": [{"b": {"c": 4}}]},
]
sf = SubrecordFilter("a.b", NonModifyingFieldDoubler("c"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_conditional_path(self):
predicate = lambda r: r["a"] == 1 # noqa
# double b if a == 1, otherwise double c
cpf = ConditionalPathFilter(predicate, FieldDoubler("b"), FieldDoubler("c"))
expected_data = [
{"a": 1, "b": 4, "c": 3},
{"a": 5, "b": 5, "c": 10},
{"a": 1, "b": 20, "c": 100},
]
self.assert_filter_result(cpf, expected_data)
# Tests for Generic Filters
def test_field_modifier(self):
# another version of FieldDoubler
fm = FieldModifier(["a", "c"], lambda x: x * 2)
# check against expected data
expected_data = [
{"a": 2, "b": 2, "c": 6},
{"a": 10, "b": 5, "c": 10},
{"a": 2, "b": 10, "c": 200},
]
self.assert_filter_result(fm, expected_data)
def test_field_keeper(self):
fk = FieldKeeper(["c"])
# check against expected results
expected_data = [{"c": 3}, {"c": 5}, {"c": 100}]
self.assert_filter_result(fk, expected_data)
def test_field_remover(self):
fr = FieldRemover(["a", "b"])
# check against expected results
expected_data = [{"c": 3}, {"c": 5}, {"c": 100}]
self.assert_filter_result(fr, expected_data)
def test_field_merger(self):
fm = FieldMerger({"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z)
# check against expected results
expected_data = [{"sum": 6}, {"sum": 15}, {"sum": 111}]
self.assert_filter_result(fm, expected_data)
def test_field_merger_keep_fields(self):
fm = FieldMerger(
{"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z, keep_fields=True
)
# check against expected results
expected_data = [
{"a": 1, "b": 2, "c": 3, "sum": 6},
{"a": 5, "b": 5, "c": 5, "sum": 15},
{"a": 1, "b": 10, "c": 100, "sum": 111},
]
self.assert_filter_result(fm, expected_data)
def test_field_adder_scalar(self):
fa = FieldAdder("x", 7)
expected_data = [
{"a": 1, "b": 2, "c": 3, "x": 7},
{"a": 5, "b": 5, "c": 5, "x": 7},
{"a": 1, "b": 10, "c": 100, "x": 7},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_callable(self):
fa = FieldAdder("x", lambda: 7)
expected_data = [
{"a": 1, "b": 2, "c": 3, "x": 7},
{"a": 5, "b": 5, "c": 5, "x": 7},
{"a": 1, "b": 10, "c": 100, "x": 7},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_iterable(self):
fa = FieldAdder("x", [1, 2, 3])
expected_data = [
{"a": 1, "b": 2, "c": 3, "x": 1},
{"a": 5, "b": 5, "c": 5, "x": 2},
{"a": 1, "b": 10, "c": 100, "x": 3},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_replace(self):
fa = FieldAdder("b", lambda: 7)
expected_data = [
{"a": 1, "b": 7, "c": 3},
{"a": 5, "b": 7, "c": 5},
{"a": 1, "b": 7, "c": 100},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_no_replace(self):
fa = FieldAdder("b", lambda: 7, replace=False)
expected_data = [
{"a": 1, "b": 2, "c": 3},
{"a": 5, "b": 5, "c": 5},
{"a": 1, "b": 10, "c": 100},
]
self.assert_filter_result(fa, expected_data)
def test_field_copier(self):
fc = FieldCopier({"a2": "a", "b2": "b"})
expected_data = [
{"a": 1, "b": 2, "c": 3, "a2": 1, "b2": 2},
{"a": 5, "b": 5, "c": 5, "a2": 5, "b2": 5},
{"a": 1, "b": 10, "c": 100, "a2": 1, "b2": 10},
]
self.assert_filter_result(fc, expected_data)
def test_field_renamer(self):
fr = FieldRenamer({"x": "a", "y": "b"})
expected_data = [
{"x": 1, "y": 2, "c": 3},
{"x": 5, "y": 5, "c": 5},
{"x": 1, "y": 10, "c": 100},
]
self.assert_filter_result(fr, expected_data)
# TODO: splitter & flattener tests?
def test_unique_filter(self):
u = Unique()
in_data = [{"a": 77}, {"a": 33}, {"a": 77}]
expected_data = [{"a": 77}, {"a": 33}]
result = u.attach(in_data)
self.assertEqual(list(result), expected_data)
# TODO: unicode & string filter tests
if __name__ == "__main__":
unittest.main()
tests/test_recipes.py Normal file
@@ -0,0 +1,49 @@
import pytest
from saucebrush import Recipe, run_recipe, SaucebrushError, OvercookedError
from saucebrush.filters import Filter
class Raiser(Filter):
def process_record(self, record):
raise Exception("bad record")
class Saver(Filter):
def __init__(self):
self.saved = []
def process_record(self, record):
self.saved.append(record)
return record
def test_error_stream():
saver = Saver()
recipe = Recipe(Raiser(), error_stream=saver)
recipe.run([{"a": 1}, {"b": 2}])
recipe.done()
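# each failed record reaches the error stream wrapped in a dict under the "record" key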
assert saver.saved[0]["record"] == {"a": 1}
assert saver.saved[1]["record"] == {"b": 2}
# Must pass either a Recipe, a Filter or an iterable of Filters
# as the error_stream argument
assert pytest.raises(SaucebrushError, Recipe, error_stream=5)
def test_run_recipe():
saver = Saver()
run_recipe([1, 2], saver)
assert saver.saved == [1, 2]
def test_done():
saver = Saver()
recipe = Recipe(saver)
recipe.run([1])
recipe.done()
assert pytest.raises(OvercookedError, recipe.run, [2])
assert pytest.raises(OvercookedError, recipe.done)
assert saver.saved == [1]
tests/test_sources.py Normal file
@@ -0,0 +1,90 @@
from io import StringIO
from saucebrush.sources import (
CSVSource,
FixedWidthFileSource,
HtmlTableSource,
JSONSource,
)
def _get_csv():
data = """a,b,c
1,2,3
5,5,5
1,10,100"""
return StringIO(data)
def test_csv_source_basic():
source = CSVSource(_get_csv())
expected_data = [
{"a": "1", "b": "2", "c": "3"},
{"a": "5", "b": "5", "c": "5"},
{"a": "1", "b": "10", "c": "100"},
]
assert list(source) == expected_data
def test_csv_source_fieldnames():
source = CSVSource(_get_csv(), ["x", "y", "z"])
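# when fieldnames are supplied, the header row is read as ordinary data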
expected_data = [
{"x": "a", "y": "b", "z": "c"},
{"x": "1", "y": "2", "z": "3"},
{"x": "5", "y": "5", "z": "5"},
{"x": "1", "y": "10", "z": "100"},
]
assert list(source) == expected_data
def test_csv_source_skiprows():
source = CSVSource(_get_csv(), skiprows=1)
expected_data = [
{"a": "5", "b": "5", "c": "5"},
{"a": "1", "b": "10", "c": "100"},
]
assert list(source) == expected_data
def test_fixed_width_source():
data = StringIO("JamesNovember 3 1986\nTim  September151999")
fields = (("name", 5), ("month", 9), ("day", 2), ("year", 4))
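# widths 5 + 9 + 2 + 4 = 20 chars per row; padding is stripped from each field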
source = FixedWidthFileSource(data, fields)
expected_data = [
{"name": "James", "month": "November", "day": "3", "year": "1986"},
{"name": "Tim", "month": "September", "day": "15", "year": "1999"},
]
assert list(source) == expected_data
def test_json_source():
content = StringIO("""[{"a": 1, "b": "2", "c": 3}]""")
js = JSONSource(content)
assert list(js) == [{"a": 1, "b": "2", "c": 3}]
def test_html_table_source():
content = StringIO(
"""
<html>
<table id="thetable">
<tr>
<th>a</th>
<th>b</th>
<th>c</th>
</tr>
<tr>
<td>1</td>
<td>2</td>
<td>3</td>
</tr>
</table>
</html>
"""
)
hts = HtmlTableSource(content, "thetable")
assert list(hts) == [{"a": "1", "b": "2", "c": "3"}]
tests/test_stats.py Normal file
@@ -0,0 +1,55 @@
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram
def _simple_data():
return [
{"a": 1, "b": 2, "c": 3},
{"a": 5, "b": 5, "c": 5},
{"a": 1, "b": 10, "c": 100},
]
def test_sum():
fltr = Sum("b")
list(fltr.attach(_simple_data()))
assert fltr.value() == 17
def test_average():
fltr = Average("c")
list(fltr.attach(_simple_data()))
assert fltr.value() == 36.0
def test_median():
# odd number of values
fltr = Median("a")
list(fltr.attach(_simple_data()))
assert fltr.value() == 1
# even number of values
fltr = Median("a")
list(fltr.attach(_simple_data()[:2]))
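# "a" is 1 and 5 in the first two records; (1 + 5) / 2 == 3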
assert fltr.value() == 3
def test_minmax():
fltr = MinMax("b")
list(fltr.attach(_simple_data()))
assert fltr.value() == (2, 10)
def test_standard_deviation():
fltr = StandardDeviation("c")
list(fltr.attach(_simple_data()))
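# "c" values are 3, 5, 100: mean 36.0; squared deviations 1089 + 961 + 4096 = 6146
# sample variance 6146 / 2 = 3073.0, population variance 6146 / 3 ~= 2048.67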
assert fltr.average() == 36.0
assert fltr.median() == 5
assert fltr.value() == (55.4346462061408, 3073.0)
assert fltr.value(True) == (45.2621990922521, 2048.6666666666665)
def test_histogram():
fltr = Histogram("a")
fltr.label_length = 1
list(fltr.attach(_simple_data()))
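# "a" occurs as 1, 5, 1: two records of 1, one of 5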
assert str(fltr) == "\n1 **\n5 *\n"