diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..1b92459 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +github: [jamesturk] diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..bc4a441 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,17 @@ +--- +name: Bug report +about: Create a report to help us improve +title: "" +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**Environment** +Please provide output of `python -V` and your saucebrush version, as well as what operating system you're using, and any other details: + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..5efb987 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context about the feature request here. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..827453e --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,36 @@ +name: Test & Lint + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + max-parallel: 4 + matrix: + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + + steps: + # Python & dependency installation + - uses: actions/checkout@v3 + - name: setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: install Poetry + uses: snok/install-poetry@v1.2.1 + - name: set poetry config path + run: poetry config virtualenvs.path ~/.virtualenvs + - name: install dependencies + run: poetry install + + # - name: lint with mypy + # run: poetry run mypy src + - name: lint with flake8 + run: poetry run flake8 --show-source --statistics --ignore=E203,E501,W503 src + - name: pytest + run: poetry run pytest diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index c16aeb2..0000000 --- a/.travis.yml +++ /dev/null @@ -1,9 +0,0 @@ -language: python -python: - - "2.7" - - "3.5" -install: pip install nose -script: nosetests -notifications: - email: - - james.p.turk@gmail.com diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/examples/fec_electronic.py b/examples/fec_electronic.py index d482eef..01c59ec 100644 --- a/examples/fec_electronic.py +++ b/examples/fec_electronic.py @@ -1,7 +1,7 @@ import re import exceptions -class FECSource(object): +class FECSource: SPLIT_CHAR = '\x1c' FORM_FIELDS = { diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..e9f3aaa --- /dev/null +++ b/poetry.lock @@ -0,0 +1,395 @@ +[[package]] +name = "attrs" +version = "22.1.0" +description = "Classes Without Boilerplate" +category = "dev" 
+optional = false +python-versions = ">=3.5" + +[package.extras] +dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy (>=0.900,!=0.940)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "sphinx", "sphinx-notfound-page", "zope.interface"] +docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] +tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"] + +[[package]] +name = "black" +version = "22.10.0" +description = "The uncompromising code formatter." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +category = "dev" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" + +[[package]] +name = "cssselect" +version = "1.2.0" +description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" +category = "main" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "exceptiongroup" +version = "1.0.1" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "flake8" +version = "5.0.4" +description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" +optional = false +python-versions = ">=3.6.1" + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.9.0,<2.10.0" +pyflakes = ">=2.5.0,<2.6.0" + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "lxml" +version = "4.9.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=0.29.7)"] + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "mypy-extensions" +version = "0.4.3" +description = "Experimental type system extensions for programs checked with the mypy typechecker." 
+category = "dev" +optional = false +python-versions = "*" + +[[package]] +name = "packaging" +version = "21.3" +description = "Core utilities for Python packages" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" + +[[package]] +name = "pathspec" +version = "0.10.1" +description = "Utility library for gitignore style pattern matching of file paths." +category = "dev" +optional = false +python-versions = ">=3.7" + +[[package]] +name = "platformdirs" +version = "2.5.3" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo (>=2022.9.29)", "proselint (>=0.13)", "sphinx (>=5.3)", "sphinx-autodoc-typehints (>=1.19.4)"] +test = ["appdirs (==1.4.4)", "pytest (>=7.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +category = "dev" +optional = false +python-versions = ">=3.6" + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pycodestyle" +version = "2.9.1" +description = "Python style guide checker" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "pyflakes" +version = "2.5.0" +description = "passive checker of Python programs" +category = "dev" +optional = false +python-versions = ">=3.6" + +[[package]] +name = "pyparsing" +version = "3.0.9" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +category = "dev" +optional = false +python-versions = ">=3.6.8" + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "pytest" +version = "7.2.0" +description = "pytest: simple powerful testing with Python" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +category = "dev" +optional = false +python-versions = ">=3.7" + +[metadata] +lock-version = "1.1" +python-versions = "^3.10" +content-hash = "765977e700b56e9b852f6ca6f5d54e2c1343b3a07b9220e83ef969a277f67866" + +[metadata.files] +attrs = [ + {file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"}, + {file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"}, +] +black = [ + {file = "black-22.10.0-1fixedarch-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5cc42ca67989e9c3cf859e84c2bf014f6633db63d1cbdf8fdb666dcd9e77e3fa"}, + {file = "black-22.10.0-1fixedarch-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5d8f74030e67087b219b032aa33a919fae8806d49c867846bfacde57f43972ef"}, + {file = "black-22.10.0-1fixedarch-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:197df8509263b0b8614e1df1756b1dd41be6738eed2ba9e9769f3880c2b9d7b6"}, + {file = 
"black-22.10.0-1fixedarch-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:2644b5d63633702bc2c5f3754b1b475378fbbfb481f62319388235d0cd104c2d"}, + {file = "black-22.10.0-1fixedarch-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:e41a86c6c650bcecc6633ee3180d80a025db041a8e2398dcc059b3afa8382cd4"}, + {file = "black-22.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2039230db3c6c639bd84efe3292ec7b06e9214a2992cd9beb293d639c6402edb"}, + {file = "black-22.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ff67aec0a47c424bc99b71005202045dc09270da44a27848d534600ac64fc7"}, + {file = "black-22.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:819dc789f4498ecc91438a7de64427c73b45035e2e3680c92e18795a839ebb66"}, + {file = "black-22.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b9b29da4f564ba8787c119f37d174f2b69cdfdf9015b7d8c5c16121ddc054ae"}, + {file = "black-22.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b49776299fece66bffaafe357d929ca9451450f5466e997a7285ab0fe28e3b"}, + {file = "black-22.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:21199526696b8f09c3997e2b4db8d0b108d801a348414264d2eb8eb2532e540d"}, + {file = "black-22.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e464456d24e23d11fced2bc8c47ef66d471f845c7b7a42f3bd77bf3d1789650"}, + {file = "black-22.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9311e99228ae10023300ecac05be5a296f60d2fd10fff31cf5c1fa4ca4b1988d"}, + {file = "black-22.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fba8a281e570adafb79f7755ac8721b6cf1bbf691186a287e990c7929c7692ff"}, + {file = "black-22.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:915ace4ff03fdfff953962fa672d44be269deb2eaf88499a0f8805221bc68c87"}, + {file = "black-22.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:444ebfb4e441254e87bad00c661fe32df9969b2bf224373a448d8aca2132b395"}, + {file = "black-22.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:974308c58d057a651d182208a484ce80a26dac0caef2895836a92dd6ebd725e0"}, + {file = "black-22.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72ef3925f30e12a184889aac03d77d031056860ccae8a1e519f6cbb742736383"}, + {file = "black-22.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:432247333090c8c5366e69627ccb363bc58514ae3e63f7fc75c54b1ea80fa7de"}, + {file = "black-22.10.0-py3-none-any.whl", hash = "sha256:c957b2b4ea88587b46cf49d1dc17681c1e672864fd7af32fc1e9664d572b3458"}, + {file = "black-22.10.0.tar.gz", hash = "sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1"}, +] +click = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] +colorama = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] +cssselect = [ + {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"}, + {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, +] +exceptiongroup = [ + {file = "exceptiongroup-1.0.1-py3-none-any.whl", hash = 
"sha256:4d6c0aa6dd825810941c792f53d7b8d71da26f5e5f84f20f9508e8f2d33b140a"}, + {file = "exceptiongroup-1.0.1.tar.gz", hash = "sha256:73866f7f842ede6cb1daa42c4af078e2035e5f7607f0e2c762cc51bb31bbe7b2"}, +] +flake8 = [ + {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, + {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, +] +iniconfig = [ + {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, + {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, +] +lxml = [ + {file = "lxml-4.9.1-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed"}, + {file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc"}, + {file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc"}, + {file = "lxml-4.9.1-cp27-cp27m-win32.whl", hash = "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3"}, + {file = "lxml-4.9.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627"}, + {file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84"}, + {file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5"}, + {file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8"}, + {file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8"}, + {file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d"}, + {file = "lxml-4.9.1-cp310-cp310-win32.whl", hash = "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7"}, + {file = "lxml-4.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b"}, + {file = "lxml-4.9.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d"}, + {file = "lxml-4.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3"}, + {file = "lxml-4.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29"}, + {file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d"}, + {file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318"}, + {file = "lxml-4.9.1-cp35-cp35m-win32.whl", hash = "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7"}, + {file = "lxml-4.9.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4"}, + {file = "lxml-4.9.1-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b"}, + {file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf"}, + {file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3"}, + {file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391"}, + {file = "lxml-4.9.1-cp36-cp36m-win32.whl", hash = "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e"}, + {file = "lxml-4.9.1-cp36-cp36m-win_amd64.whl", hash = "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7"}, + {file = "lxml-4.9.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3"}, + {file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca"}, + {file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785"}, + {file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785"}, + {file = "lxml-4.9.1-cp37-cp37m-win32.whl", hash = "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a"}, + {file = "lxml-4.9.1-cp37-cp37m-win_amd64.whl", hash = 
"sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e"}, + {file = "lxml-4.9.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130"}, + {file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715"}, + {file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036"}, + {file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387"}, + {file = "lxml-4.9.1-cp38-cp38-win32.whl", hash = "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94"}, + {file = "lxml-4.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345"}, + {file = "lxml-4.9.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91"}, + {file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000"}, + {file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25"}, + {file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd"}, + {file = "lxml-4.9.1-cp39-cp39-win32.whl", hash = "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb"}, + {file = "lxml-4.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c"}, + {file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b"}, + {file = 
"lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2"}, + {file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73"}, + {file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c"}, + {file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9"}, + {file = "lxml-4.9.1.tar.gz", hash = "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"}, +] +mccabe = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] +mypy-extensions = [ + {file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"}, + {file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"}, +] +packaging = [ + {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, + {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, +] +pathspec = [ + {file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"}, + {file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"}, +] +platformdirs = [ + {file = "platformdirs-2.5.3-py3-none-any.whl", hash = "sha256:0cb405749187a194f444c25c82ef7225232f11564721eabffc6ec70df83b11cb"}, + {file = "platformdirs-2.5.3.tar.gz", hash = "sha256:6e52c21afff35cb659c6e52d8b4d61b9bd544557180440538f255d9382c8cbe0"}, +] +pluggy = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] +pycodestyle = [ + {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, + {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, +] +pyflakes = [ + {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, + {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, +] +pyparsing = [ + {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, + {file = "pyparsing-3.0.9.tar.gz", hash = 
"sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, +] +pytest = [ + {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, + {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, +] +tomli = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ce52960 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,22 @@ +[tool.poetry] +name = "saucebrush" +version = "0.6.0" +description = "" +authors = ["James Turk "] +license = "MIT" +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.10" +lxml = "^4.9.1" +cssselect = "^1.2.0" + + +[tool.poetry.group.dev.dependencies] +pytest = "^7.2.0" +flake8 = "^5.0.4" +black = "^22.10.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/saucebrush/tests/__init__.py b/saucebrush/tests/__init__.py deleted file mode 100644 index 4297ebb..0000000 --- a/saucebrush/tests/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest -from saucebrush.tests.filters import FilterTestCase -from saucebrush.tests.sources import SourceTestCase -from saucebrush.tests.emitters import EmitterTestCase -from saucebrush.tests.recipes import RecipeTestCase -from saucebrush.tests.stats import StatsTestCase - -filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase) -source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase) -emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase) -recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase) -stats_suite = unittest.TestLoader().loadTestsFromTestCase(StatsTestCase) - -if __name__ == '__main__': - unittest.main() diff --git a/saucebrush/tests/emitters.py b/saucebrush/tests/emitters.py deleted file mode 100644 index 606178e..0000000 --- a/saucebrush/tests/emitters.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import unicode_literals -from contextlib import closing -from io import StringIO -import os -import unittest - -from saucebrush.emitters import ( - DebugEmitter, CSVEmitter, CountEmitter, SqliteEmitter, SqlDumpEmitter) - -class EmitterTestCase(unittest.TestCase): - - def test_debug_emitter(self): - with closing(StringIO()) as output: - de = DebugEmitter(output) - list(de.attach([1,2,3])) - self.assertEqual(output.getvalue(), '1\n2\n3\n') - - def test_count_emitter(self): - - # values for test - values = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22] - - with closing(StringIO()) as output: - - # test without of parameter - ce = CountEmitter(every=10, outfile=output, format="%(count)s records\n") - list(ce.attach(values)) - self.assertEqual(output.getvalue(), '10 records\n20 records\n') - ce.done() - self.assertEqual(output.getvalue(), '10 records\n20 records\n22 records\n') - - with closing(StringIO()) as output: - - # test with of parameter - ce = CountEmitter(every=10, outfile=output, of=len(values)) - list(ce.attach(values)) - self.assertEqual(output.getvalue(), '10 of 22\n20 of 22\n') - ce.done() - self.assertEqual(output.getvalue(), '10 of 22\n20 of 22\n22 of 22\n') - - def test_csv_emitter(self): - - try: - import cStringIO # if Python 2.x then use old cStringIO - io = 
cStringIO.StringIO() - except: - io = StringIO() # if Python 3.x then use StringIO - - with closing(io) as output: - ce = CSVEmitter(output, ('x','y','z')) - list(ce.attach([{'x':1, 'y':2, 'z':3}, {'x':5, 'y':5, 'z':5}])) - self.assertEqual(output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n') - - def test_sqlite_emitter(self): - - import sqlite3, tempfile - - with closing(tempfile.NamedTemporaryFile(suffix='.db')) as f: - db_path = f.name - - sle = SqliteEmitter(db_path, 'testtable', fieldnames=('a','b','c')) - list(sle.attach([{'a': '1', 'b': '2', 'c': '3'}])) - sle.done() - - with closing(sqlite3.connect(db_path)) as conn: - cur = conn.cursor() - cur.execute("""SELECT a, b, c FROM testtable""") - results = cur.fetchall() - - os.unlink(db_path) - - self.assertEqual(results, [('1', '2', '3')]) - - def test_sql_dump_emitter(self): - - with closing(StringIO()) as bffr: - - sde = SqlDumpEmitter(bffr, 'testtable', ('a', 'b')) - list(sde.attach([{'a': 1, 'b': '2'}])) - sde.done() - - self.assertEqual(bffr.getvalue(), "INSERT INTO `testtable` (`a`,`b`) VALUES (1,'2');\n") - - -if __name__ == '__main__': - unittest.main() diff --git a/saucebrush/tests/filters.py b/saucebrush/tests/filters.py deleted file mode 100644 index 04ce0c4..0000000 --- a/saucebrush/tests/filters.py +++ /dev/null @@ -1,304 +0,0 @@ -import unittest -import operator -import types -from saucebrush.filters import (Filter, YieldFilter, FieldFilter, - SubrecordFilter, ConditionalPathFilter, - ConditionalFilter, FieldModifier, FieldKeeper, - FieldRemover, FieldMerger, FieldAdder, - FieldCopier, FieldRenamer, Unique) - -class DummyRecipe(object): - rejected_record = None - rejected_msg = None - def reject_record(self, record, msg): - self.rejected_record = record - self.rejected_msg = msg - -class Doubler(Filter): - def process_record(self, record): - return record*2 - -class OddRemover(Filter): - def process_record(self, record): - if record % 2 == 0: - return record - else: - return None # explicitly return None - -class ListFlattener(YieldFilter): - def process_record(self, record): - for item in record: - yield item - -class FieldDoubler(FieldFilter): - def process_field(self, item): - return item*2 - -class NonModifyingFieldDoubler(Filter): - def __init__(self, key): - self.key = key - - def process_record(self, record): - record = dict(record) - record[self.key] *= 2 - return record - -class ConditionalOddRemover(ConditionalFilter): - def test_record(self, record): - # return True for even values - return record % 2 == 0 - -class FilterTestCase(unittest.TestCase): - - def _simple_data(self): - return [{'a':1, 'b':2, 'c':3}, - {'a':5, 'b':5, 'c':5}, - {'a':1, 'b':10, 'c':100}] - - def assert_filter_result(self, filter_obj, expected_data): - result = filter_obj.attach(self._simple_data()) - self.assertEqual(list(result), expected_data) - - def test_reject_record(self): - recipe = DummyRecipe() - f = Doubler() - result = f.attach([1,2,3], recipe=recipe) - # next has to be called for attach to take effect - next(result) - f.reject_record('bad', 'this one was bad') - - # ensure that the rejection propagated to the recipe - self.assertEqual('bad', recipe.rejected_record) - self.assertEqual('this one was bad', recipe.rejected_msg) - - def test_simple_filter(self): - df = Doubler() - result = df.attach([1,2,3]) - - # ensure we got a generator that yields 2,4,6 - self.assertEqual(type(result), types.GeneratorType) - self.assertEqual(list(result), [2,4,6]) - - def test_simple_filter_return_none(self): - cf = OddRemover() - result = 
cf.attach(range(10)) - - # ensure only even numbers remain - self.assertEqual(list(result), [0,2,4,6,8]) - - def test_simple_yield_filter(self): - lf = ListFlattener() - result = lf.attach([[1],[2,3],[4,5,6]]) - - # ensure we got a generator that yields 1,2,3,4,5,6 - self.assertEqual(type(result), types.GeneratorType) - self.assertEqual(list(result), [1,2,3,4,5,6]) - - def test_simple_field_filter(self): - ff = FieldDoubler(['a', 'c']) - - # check against expected data - expected_data = [{'a':2, 'b':2, 'c':6}, - {'a':10, 'b':5, 'c':10}, - {'a':2, 'b':10, 'c':200}] - self.assert_filter_result(ff, expected_data) - - def test_conditional_filter(self): - cf = ConditionalOddRemover() - result = cf.attach(range(10)) - - # ensure only even numbers remain - self.assertEqual(list(result), [0,2,4,6,8]) - - ### Tests for Subrecord - - def test_subrecord_filter_list(self): - data = [{'a': [{'b': 2}, {'b': 4}]}, - {'a': [{'b': 5}]}, - {'a': [{'b': 8}, {'b':2}, {'b':1}]}] - - expected = [{'a': [{'b': 4}, {'b': 8}]}, - {'a': [{'b': 10}]}, - {'a': [{'b': 16}, {'b':4}, {'b':2}]}] - - sf = SubrecordFilter('a', NonModifyingFieldDoubler('b')) - result = sf.attach(data) - - self.assertEqual(list(result), expected) - - def test_subrecord_filter_deep(self): - data = [{'a': {'d':[{'b': 2}, {'b': 4}]}}, - {'a': {'d':[{'b': 5}]}}, - {'a': {'d':[{'b': 8}, {'b':2}, {'b':1}]}}] - - expected = [{'a': {'d':[{'b': 4}, {'b': 8}]}}, - {'a': {'d':[{'b': 10}]}}, - {'a': {'d':[{'b': 16}, {'b':4}, {'b':2}]}}] - - sf = SubrecordFilter('a.d', NonModifyingFieldDoubler('b')) - result = sf.attach(data) - - self.assertEqual(list(result), expected) - - def test_subrecord_filter_nonlist(self): - data = [ - {'a':{'b':{'c':1}}}, - {'a':{'b':{'c':2}}}, - {'a':{'b':{'c':3}}}, - ] - - expected = [ - {'a':{'b':{'c':2}}}, - {'a':{'b':{'c':4}}}, - {'a':{'b':{'c':6}}}, - ] - - sf = SubrecordFilter('a.b', NonModifyingFieldDoubler('c')) - result = sf.attach(data) - - self.assertEqual(list(result), expected) - - def test_subrecord_filter_list_in_path(self): - data = [ - {'a': [{'b': {'c': 5}}, {'b': {'c': 6}}]}, - {'a': [{'b': {'c': 1}}, {'b': {'c': 2}}, {'b': {'c': 3}}]}, - {'a': [{'b': {'c': 2}} ]} - ] - - expected = [ - {'a': [{'b': {'c': 10}}, {'b': {'c': 12}}]}, - {'a': [{'b': {'c': 2}}, {'b': {'c': 4}}, {'b': {'c': 6}}]}, - {'a': [{'b': {'c': 4}} ]} - ] - - sf = SubrecordFilter('a.b', NonModifyingFieldDoubler('c')) - result = sf.attach(data) - - self.assertEqual(list(result), expected) - - def test_conditional_path(self): - - predicate = lambda r: r['a'] == 1 - - # double b if a == 1, otherwise double c - cpf = ConditionalPathFilter(predicate, FieldDoubler('b'), - FieldDoubler('c')) - expected_data = [{'a':1, 'b':4, 'c':3}, - {'a':5, 'b':5, 'c':10}, - {'a':1, 'b':20, 'c':100}] - - self.assert_filter_result(cpf, expected_data) - - ### Tests for Generic Filters - - def test_field_modifier(self): - # another version of FieldDoubler - fm = FieldModifier(['a', 'c'], lambda x: x*2) - - # check against expected data - expected_data = [{'a':2, 'b':2, 'c':6}, - {'a':10, 'b':5, 'c':10}, - {'a':2, 'b':10, 'c':200}] - self.assert_filter_result(fm, expected_data) - - def test_field_keeper(self): - fk = FieldKeeper(['c']) - - # check against expected results - expected_data = [{'c':3}, {'c':5}, {'c':100}] - self.assert_filter_result(fk, expected_data) - - def test_field_remover(self): - fr = FieldRemover(['a', 'b']) - - # check against expected results - expected_data = [{'c':3}, {'c':5}, {'c':100}] - self.assert_filter_result(fr, expected_data) - - def 
test_field_merger(self): - fm = FieldMerger({'sum':('a','b','c')}, lambda x,y,z: x+y+z) - - # check against expected results - expected_data = [{'sum':6}, {'sum':15}, {'sum':111}] - self.assert_filter_result(fm, expected_data) - - def test_field_merger_keep_fields(self): - fm = FieldMerger({'sum':('a','b','c')}, lambda x,y,z: x+y+z, - keep_fields=True) - - # check against expected results - expected_data = [{'a':1, 'b':2, 'c':3, 'sum':6}, - {'a':5, 'b':5, 'c':5, 'sum':15}, - {'a':1, 'b':10, 'c':100, 'sum': 111}] - self.assert_filter_result(fm, expected_data) - - def test_field_adder_scalar(self): - fa = FieldAdder('x', 7) - - expected_data = [{'a':1, 'b':2, 'c':3, 'x':7}, - {'a':5, 'b':5, 'c':5, 'x':7}, - {'a':1, 'b':10, 'c':100, 'x': 7}] - self.assert_filter_result(fa, expected_data) - - def test_field_adder_callable(self): - fa = FieldAdder('x', lambda: 7) - - expected_data = [{'a':1, 'b':2, 'c':3, 'x':7}, - {'a':5, 'b':5, 'c':5, 'x':7}, - {'a':1, 'b':10, 'c':100, 'x': 7}] - self.assert_filter_result(fa, expected_data) - - def test_field_adder_iterable(self): - fa = FieldAdder('x', [1,2,3]) - - expected_data = [{'a':1, 'b':2, 'c':3, 'x':1}, - {'a':5, 'b':5, 'c':5, 'x':2}, - {'a':1, 'b':10, 'c':100, 'x': 3}] - self.assert_filter_result(fa, expected_data) - - def test_field_adder_replace(self): - fa = FieldAdder('b', lambda: 7) - - expected_data = [{'a':1, 'b':7, 'c':3}, - {'a':5, 'b':7, 'c':5}, - {'a':1, 'b':7, 'c':100}] - self.assert_filter_result(fa, expected_data) - - def test_field_adder_no_replace(self): - fa = FieldAdder('b', lambda: 7, replace=False) - - expected_data = [{'a':1, 'b':2, 'c':3}, - {'a':5, 'b':5, 'c':5}, - {'a':1, 'b':10, 'c':100}] - self.assert_filter_result(fa, expected_data) - - def test_field_copier(self): - fc = FieldCopier({'a2':'a', 'b2':'b'}) - - expected_data = [{'a':1, 'b':2, 'c':3, 'a2':1, 'b2':2}, - {'a':5, 'b':5, 'c':5, 'a2':5, 'b2':5}, - {'a':1, 'b':10, 'c':100, 'a2': 1, 'b2': 10}] - self.assert_filter_result(fc, expected_data) - - def test_field_renamer(self): - fr = FieldRenamer({'x':'a', 'y':'b'}) - - expected_data = [{'x':1, 'y':2, 'c':3}, - {'x':5, 'y':5, 'c':5}, - {'x':1, 'y':10, 'c':100}] - self.assert_filter_result(fr, expected_data) - - # TODO: splitter & flattner tests? 
- - def test_unique_filter(self): - u = Unique() - in_data = [{'a': 77}, {'a':33}, {'a': 77}] - expected_data = [{'a': 77}, {'a':33}] - result = u.attach(in_data) - - self.assertEqual(list(result), expected_data) - - # TODO: unicode & string filter tests - -if __name__ == '__main__': - unittest.main() diff --git a/saucebrush/tests/recipes.py b/saucebrush/tests/recipes.py deleted file mode 100644 index 288e6e7..0000000 --- a/saucebrush/tests/recipes.py +++ /dev/null @@ -1,53 +0,0 @@ -import doctest -import unittest -from saucebrush import Recipe, run_recipe, SaucebrushError, OvercookedError -from saucebrush.filters import Filter - - -class Raiser(Filter): - def process_record(self, record): - raise Exception("bad record") - - -class Saver(Filter): - def __init__(self): - self.saved = [] - - def process_record(self, record): - self.saved.append(record) - return record - - -class RecipeTestCase(unittest.TestCase): - def test_error_stream(self): - saver = Saver() - recipe = Recipe(Raiser(), error_stream=saver) - recipe.run([{'a': 1}, {'b': 2}]) - recipe.done() - - self.assertEqual(saver.saved[0]['record'], {'a': 1}) - self.assertEqual(saver.saved[1]['record'], {'b': 2}) - - # Must pass either a Recipe, a Filter or an iterable of Filters - # as the error_stream argument - self.assertRaises(SaucebrushError, Recipe, error_stream=5) - - def test_run_recipe(self): - saver = Saver() - run_recipe([1, 2], saver) - - self.assertEqual(saver.saved, [1, 2]) - - def test_done(self): - saver = Saver() - recipe = Recipe(saver) - recipe.run([1]) - recipe.done() - - self.assertRaises(OvercookedError, recipe.run, [2]) - self.assertRaises(OvercookedError, recipe.done) - self.assertEqual(saver.saved, [1]) - - -if __name__ == '__main__': - unittest.main() diff --git a/saucebrush/tests/sources.py b/saucebrush/tests/sources.py deleted file mode 100644 index f190952..0000000 --- a/saucebrush/tests/sources.py +++ /dev/null @@ -1,87 +0,0 @@ -from __future__ import unicode_literals -from io import BytesIO, StringIO -import unittest - -from saucebrush.sources import ( - CSVSource, FixedWidthFileSource, HtmlTableSource, JSONSource) - -class SourceTestCase(unittest.TestCase): - - def _get_csv(self): - data = '''a,b,c -1,2,3 -5,5,5 -1,10,100''' - return StringIO(data) - - def test_csv_source_basic(self): - source = CSVSource(self._get_csv()) - expected_data = [{'a':'1', 'b':'2', 'c':'3'}, - {'a':'5', 'b':'5', 'c':'5'}, - {'a':'1', 'b':'10', 'c':'100'}] - self.assertEqual(list(source), expected_data) - - def test_csv_source_fieldnames(self): - source = CSVSource(self._get_csv(), ['x','y','z']) - expected_data = [{'x':'a', 'y':'b', 'z':'c'}, - {'x':'1', 'y':'2', 'z':'3'}, - {'x':'5', 'y':'5', 'z':'5'}, - {'x':'1', 'y':'10', 'z':'100'}] - self.assertEqual(list(source), expected_data) - - def test_csv_source_skiprows(self): - source = CSVSource(self._get_csv(), skiprows=1) - expected_data = [{'a':'5', 'b':'5', 'c':'5'}, - {'a':'1', 'b':'10', 'c':'100'}] - self.assertEqual(list(source), expected_data) - - def test_fixed_width_source(self): - data = StringIO('JamesNovember 3 1986\nTim September151999') - fields = (('name',5), ('month',9), ('day',2), ('year',4)) - source = FixedWidthFileSource(data, fields) - expected_data = [{'name':'James', 'month':'November', 'day':'3', - 'year':'1986'}, - {'name':'Tim', 'month':'September', 'day':'15', - 'year':'1999'}] - self.assertEqual(list(source), expected_data) - - def test_json_source(self): - - content = StringIO("""[{"a": 1, "b": "2", "c": 3}]""") - - js = JSONSource(content) - 
self.assertEqual(list(js), [{'a': 1, 'b': '2', 'c': 3}]) - - def test_html_table_source(self): - - content = StringIO(""" - <html> - <table id="thetable"> - <tr><th>a</th><th>b</th><th>c</th></tr> - <tr><td>1</td><td>2</td><td>3</td></tr> - </table> - </html>
- - """) - - try: - - import lxml - - hts = HtmlTableSource(content, 'thetable') - self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}]) - - except ImportError: - # Python 2.6 doesn't have skipTest. We'll just suffer without it. - if hasattr(self, 'skipTest'): - self.skipTest("lxml is not installed") - -if __name__ == '__main__': - unittest.main() diff --git a/saucebrush/tests/stats.py b/saucebrush/tests/stats.py deleted file mode 100644 index 37a2933..0000000 --- a/saucebrush/tests/stats.py +++ /dev/null @@ -1,52 +0,0 @@ -import unittest -from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram - -class StatsTestCase(unittest.TestCase): - - def _simple_data(self): - return [{'a':1, 'b':2, 'c':3}, - {'a':5, 'b':5, 'c':5}, - {'a':1, 'b':10, 'c':100}] - - def test_sum(self): - fltr = Sum('b') - list(fltr.attach(self._simple_data())) - self.assertEqual(fltr.value(), 17) - - def test_average(self): - fltr = Average('c') - list(fltr.attach(self._simple_data())) - self.assertEqual(fltr.value(), 36.0) - - def test_median(self): - # odd number of values - fltr = Median('a') - list(fltr.attach(self._simple_data())) - self.assertEqual(fltr.value(), 1) - - # even number of values - fltr = Median('a') - list(fltr.attach(self._simple_data()[:2])) - self.assertEqual(fltr.value(), 3) - - def test_minmax(self): - fltr = MinMax('b') - list(fltr.attach(self._simple_data())) - self.assertEqual(fltr.value(), (2, 10)) - - def test_standard_deviation(self): - fltr = StandardDeviation('c') - list(fltr.attach(self._simple_data())) - self.assertEqual(fltr.average(), 36.0) - self.assertEqual(fltr.median(), 5) - self.assertEqual(fltr.value(), (55.4346462061408, 3073.0)) - self.assertEqual(fltr.value(True), (45.2621990922521, 2048.6666666666665)) - - def test_histogram(self): - fltr = Histogram('a') - fltr.label_length = 1 - list(fltr.attach(self._simple_data())) - self.assertEqual(str(fltr), "\n1 **\n5 *\n") - -if __name__ == '__main__': - unittest.main() diff --git a/setup.py b/setup.py deleted file mode 100644 index 99a322e..0000000 --- a/setup.py +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup - -setup(name="saucebrush", - version='0.5.0-dev', - packages=['saucebrush'], - ) diff --git a/saucebrush/__init__.py b/src/saucebrush/__init__.py similarity index 67% rename from saucebrush/__init__.py rename to src/saucebrush/__init__.py index d900a21..03201e9 100644 --- a/saucebrush/__init__.py +++ b/src/saucebrush/__init__.py @@ -2,7 +2,7 @@ Saucebrush is a data loading & manipulation framework written in python. """ -from . import filters, emitters, sources, utils +from . import filters, emitters, sources, utils # noqa class SaucebrushError(Exception): @@ -13,39 +13,39 @@ class OvercookedError(Exception): """ Exception for trying to operate on a Recipe that has been finished. 
""" + pass -class Recipe(object): - +class Recipe: def __init__(self, *filter_args, **kwargs): self.finished = False self.filters = [] for filter in filter_args: - if hasattr(filter, 'filters'): + if hasattr(filter, "filters"): self.filters.extend(filter.filters) else: self.filters.append(filter) - self.error_stream = kwargs.get('error_stream') + self.error_stream = kwargs.get("error_stream") if self.error_stream and not isinstance(self.error_stream, Recipe): if isinstance(self.error_stream, filters.Filter): self.error_stream = Recipe(self.error_stream) - elif hasattr(self.error_stream, '__iter__'): + elif hasattr(self.error_stream, "__iter__"): self.error_stream = Recipe(*self.error_stream) else: - raise SaucebrushError('error_stream must be either a filter' - ' or an iterable of filters') + raise SaucebrushError( + "error_stream must be either a filter" " or an iterable of filters" + ) def reject_record(self, record, exception): if self.error_stream: - self.error_stream.run([{'record': record, - 'exception': repr(exception)}]) + self.error_stream.run([{"record": record, "exception": repr(exception)}]) def run(self, source): if self.finished: - raise OvercookedError('run() called on finished recipe') + raise OvercookedError("run() called on finished recipe") # connect datapath data = source @@ -58,7 +58,7 @@ class Recipe(object): def done(self): if self.finished: - raise OvercookedError('done() called on finished recipe') + raise OvercookedError("done() called on finished recipe") self.finished = True @@ -70,12 +70,11 @@ class Recipe(object): try: filter_.done() except AttributeError: - pass # don't care if there isn't a done method + pass # don't care if there isn't a done method def run_recipe(source, *filter_args, **kwargs): - """ Process data, taking it from a source and applying any number of filters - """ + """Process data, taking it from a source and applying any number of filters""" r = Recipe(*filter_args, **kwargs) r.run(source) diff --git a/saucebrush/emitters.py b/src/saucebrush/emitters.py similarity index 52% rename from saucebrush/emitters.py rename to src/saucebrush/emitters.py index b165407..1eae2c9 100644 --- a/saucebrush/emitters.py +++ b/src/saucebrush/emitters.py @@ -2,49 +2,53 @@ Saucebrush Emitters are filters that instead of modifying the record, output it in some manner. """ -from __future__ import unicode_literals from saucebrush.filters import Filter + class Emitter(Filter): - """ ABC for emitters + """ABC for emitters - All derived emitters must provide an emit_record(self, record) that - takes a single record (python dictionary). + All derived emitters must provide an emit_record(self, record) that + takes a single record (python dictionary). - Emitters can optionally define a done() method that is called after - all records are processed (allowing database flushes, or printing of - aggregate data). + Emitters can optionally define a done() method that is called after + all records are processed (allowing database flushes, or printing of + aggregate data). """ + def process_record(self, record): self.emit_record(record) return record def emit_record(self, record): - """ Abstract method to be overridden. + """Abstract method to be overridden. - Called with a single record, should "emit" the record unmodified. + Called with a single record, should "emit" the record unmodified. 
""" - raise NotImplementedError('emit_record not defined in ' + - self.__class__.__name__) + raise NotImplementedError( + "emit_record not defined in " + self.__class__.__name__ + ) def done(self): - """ No-op Method to be overridden. + """No-op Method to be overridden. - Called when all processing is complete + Called when all processing is complete """ pass class DebugEmitter(Emitter): - """ Emitter that prints raw records to a file, useful for debugging. + """Emitter that prints raw records to a file, useful for debugging. - DebugEmitter() by default prints to stdout. - DebugEmitter(open('test', 'w')) would print to a file named test + DebugEmitter() by default prints to stdout. + DebugEmitter(open('test', 'w')) would print to a file named test """ + def __init__(self, outfile=None): - super(DebugEmitter, self).__init__() + super().__init__() if not outfile: import sys + self._outfile = sys.stdout else: self._outfile = outfile @@ -54,20 +58,21 @@ class DebugEmitter(Emitter): class CountEmitter(Emitter): - """ Emitter that writes the record count to a file-like object. + """Emitter that writes the record count to a file-like object. - CountEmitter() by default writes to stdout. - CountEmitter(outfile=open('text', 'w')) would print to a file name test. - CountEmitter(every=1000000) would write the count every 1,000,000 records. - CountEmitter(every=100, of=2000) would write " of 2000" every 100 records. + CountEmitter() by default writes to stdout. + CountEmitter(outfile=open('text', 'w')) would print to a file name test. + CountEmitter(every=1000000) would write the count every 1,000,000 records. + CountEmitter(every=100, of=2000) would write " of 2000" every 100 records. """ def __init__(self, every=1000, of=None, outfile=None, format=None): - super(CountEmitter, self).__init__() + super().__init__() if not outfile: import sys + self._outfile = sys.stdout else: self._outfile = outfile @@ -84,7 +89,7 @@ class CountEmitter(Emitter): self.count = 0 def format(self): - return self._format % {'count': self.count, 'of': self._of} + return self._format % {"count": self.count, "of": self._of} def emit_record(self, record): self.count += 1 @@ -96,15 +101,16 @@ class CountEmitter(Emitter): class CSVEmitter(Emitter): - """ Emitter that writes records to a CSV file. + """Emitter that writes records to a CSV file. - CSVEmitter(open('output.csv','w'), ('id', 'name', 'phone')) writes all - records to a csvfile with the columns in the order specified. + CSVEmitter(open('output.csv','w'), ('id', 'name', 'phone')) writes all + records to a csvfile with the columns in the order specified. """ def __init__(self, csvfile, fieldnames): - super(CSVEmitter, self).__init__() + super().__init__() import csv + self._dictwriter = csv.DictWriter(csvfile, fieldnames) # write header row header_row = dict(zip(fieldnames, fieldnames)) @@ -115,36 +121,43 @@ class CSVEmitter(Emitter): class SqliteEmitter(Emitter): - """ Emitter that writes records to a SQLite database. + """Emitter that writes records to a SQLite database. - SqliteEmitter('addressbook.db', 'friend') writes all records to the - friends table in the SQLite database named addressbook.db + SqliteEmitter('addressbook.db', 'friend') writes all records to the + friends table in the SQLite database named addressbook.db - (To have the emitter create the table, the fieldnames should be passed - as a third parameter to SqliteEmitter.) + (To have the emitter create the table, the fieldnames should be passed + as a third parameter to SqliteEmitter.) 
""" def __init__(self, dbname, table_name, fieldnames=None, replace=False, quiet=False): - super(SqliteEmitter, self).__init__() + super().__init__() import sqlite3 + self._conn = sqlite3.connect(dbname) self._cursor = self._conn.cursor() self._table_name = table_name self._replace = replace self._quiet = quiet if fieldnames: - create = "CREATE TABLE IF NOT EXISTS %s (%s)" % (table_name, - ', '.join([' '.join((field, 'TEXT')) for field in fieldnames])) + create = "CREATE TABLE IF NOT EXISTS %s (%s)" % ( + table_name, + ", ".join([" ".join((field, "TEXT")) for field in fieldnames]), + ) self._cursor.execute(create) def emit_record(self, record): import sqlite3 + # input should be escaped with ? if data isn't trusted - qmarks = ','.join(('?',) * len(record)) - insert = 'INSERT OR REPLACE' if self._replace else 'INSERT' - insert = '%s INTO %s (%s) VALUES (%s)' % (insert, self._table_name, - ','.join(record.keys()), - qmarks) + qmarks = ",".join(("?",) * len(record)) + insert = "INSERT OR REPLACE" if self._replace else "INSERT" + insert = "%s INTO %s (%s) VALUES (%s)" % ( + insert, + self._table_name, + ",".join(record.keys()), + qmarks, + ) try: self._cursor.execute(insert, list(record.values())) except sqlite3.IntegrityError as ie: @@ -158,26 +171,29 @@ class SqliteEmitter(Emitter): class SqlDumpEmitter(Emitter): - """ Emitter that writes SQL INSERT statements. + """Emitter that writes SQL INSERT statements. - The output generated by the SqlDumpEmitter is intended to be used to - populate a mySQL database. + The output generated by the SqlDumpEmitter is intended to be used to + populate a mySQL database. - SqlDumpEmitter(open('addresses.sql', 'w'), 'friend', ('name', 'phone')) - writes statements to addresses.sql to insert the data - into the friends table. + SqlDumpEmitter(open('addresses.sql', 'w'), 'friend', ('name', 'phone')) + writes statements to addresses.sql to insert the data + into the friends table. """ def __init__(self, outfile, table_name, fieldnames): - super(SqlDumpEmitter, self).__init__() + super().__init__() self._fieldnames = fieldnames if not outfile: import sys + self._outfile = sys.stderr else: self._outfile = outfile self._insert_str = "INSERT INTO `%s` (`%s`) VALUES (%%s);\n" % ( - table_name, '`,`'.join(fieldnames)) + table_name, + "`,`".join(fieldnames), + ) def quote(self, item): @@ -190,29 +206,31 @@ class SqlDumpEmitter(Emitter): types = (str,) if isinstance(item, types): - item = item.replace("\\","\\\\").replace("'","\\'").replace(chr(0),'0') + item = item.replace("\\", "\\\\").replace("'", "\\'").replace(chr(0), "0") return "'%s'" % item return "%s" % item def emit_record(self, record): quoted_data = [self.quote(record[field]) for field in self._fieldnames] - self._outfile.write(self._insert_str % ','.join(quoted_data)) + self._outfile.write(self._insert_str % ",".join(quoted_data)) class DjangoModelEmitter(Emitter): - """ Emitter that populates a table corresponding to a django model. + """Emitter that populates a table corresponding to a django model. - Takes a django settings file, app label and model name and uses django - to insert the records into the appropriate table. + Takes a django settings file, app label and model name and uses django + to insert the records into the appropriate table. - DjangoModelEmitter('settings.py', 'addressbook', 'friend') writes - records to addressbook.models.friend model using database settings - from settings.py. 
+    DjangoModelEmitter('settings.py', 'addressbook', 'friend') writes
+    records to addressbook.models.friend model using database settings
+    from settings.py.
     """
+
     def __init__(self, dj_settings, app_label, model_name):
-        super(DjangoModelEmitter, self).__init__()
+        super().__init__()
         from saucebrush.utils import get_django_model
+
         self._dbmodel = get_django_model(dj_settings, app_label, model_name)
         if not self._dbmodel:
             raise Exception("No such model: %s %s" % (app_label, model_name))
@@ -222,19 +240,30 @@ class DjangoModelEmitter(Emitter):
 
 class MongoDBEmitter(Emitter):
-    """ Emitter that creates a document in a MongoDB datastore
+    """Emitter that creates a document in a MongoDB datastore
 
-        The names of the database and collection in which the records will
-        be inserted are required parameters. The host and port are optional,
-        defaulting to 'localhost' and 27017, repectively.
+    The names of the database and collection in which the records will
+    be inserted are required parameters. The host and port are optional,
+    defaulting to 'localhost' and 27017, respectively.
     """
-    def __init__(self, database, collection, host='localhost', port=27017, drop_collection=False, conn=None):
-        super(MongoDBEmitter, self).__init__()
+
+    def __init__(
+        self,
+        database,
+        collection,
+        host="localhost",
+        port=27017,
+        drop_collection=False,
+        conn=None,
+    ):
+        super().__init__()
         from pymongo.database import Database
+
         if not isinstance(database, Database):
             if not conn:
                 from pymongo.connection import Connection
+
                 conn = Connection(host, port)
             db = conn[database]
         else:
@@ -249,16 +278,17 @@ class MongoDBEmitter(Emitter):
 
 class LoggingEmitter(Emitter):
-    """ Emitter that logs to a Python logging.Logger instance.
+    """Emitter that logs to a Python logging.Logger instance.
 
-        The msg_template will be passed the record being emitted as
-        a format parameter. The resulting message will get logged
-        at the provided level.
+    The msg_template will be passed the record being emitted as
+    a format parameter. The resulting message will get logged
+    at the provided level.
     """
+
     import logging
 
     def __init__(self, logger, msg_template, level=logging.DEBUG):
-        super(LoggingEmitter, self).__init__()
+        super().__init__()
         self.logger = logger
         self.msg_template = msg_template
         self.level = level
diff --git a/saucebrush/filters.py b/src/saucebrush/filters.py
similarity index 54%
rename from saucebrush/filters.py
rename to src/saucebrush/filters.py
index c942863..3e1a4ab 100644
--- a/saucebrush/filters.py
+++ b/src/saucebrush/filters.py
@@ -12,26 +12,28 @@ import re
 import time
 
 ######################
-## Abstract Filters ##
+# Abstract Filters #
 ######################
 
 
-class Filter(object):
-    """ ABC for filters that operate on records.
-        All derived filters must provide a process_record(self, record) that
-        takes a single record (python dictionary) and returns a result.
+class Filter:
+    """ABC for filters that operate on records.
+
+    All derived filters must provide a process_record(self, record) that
+    takes a single record (python dictionary) and returns a result.
     """
 
     def process_record(self, record):
-        """ Abstract method to be overridden.
+        """Abstract method to be overridden.
 
-            Called with a single record, should return modified record.
+        Called with a single record, should return modified record.
""" - raise NotImplementedError('process_record not defined in ' + - self.__class__.__name__) + raise NotImplementedError( + "process_record not defined in " + self.__class__.__name__ + ) def reject_record(self, record, exception): - recipe = getattr(self, '_recipe') + recipe = getattr(self, "_recipe") if recipe: recipe.reject_record(record, exception) @@ -47,11 +49,11 @@ class Filter(object): class YieldFilter(Filter): - """ ABC for defining filters where process_record yields. + """ABC for defining filters where process_record yields. - If process_record cannot return exactly one result for every record - it is passed, it should yield back as many records as needed and the - filter must derive from YieldFilter. + If process_record cannot return exactly one result for every record + it is passed, it should yield back as many records as needed and the + filter must derive from YieldFilter. """ def attach(self, source, recipe=None): @@ -65,19 +67,19 @@ class YieldFilter(Filter): class FieldFilter(Filter): - """ ABC for filters that do a single operation on individual fields. + """ABC for filters that do a single operation on individual fields. - All derived filters must provide a process_field(self, item) that - returns a modified item. process_field is called on one or more keys - passed into __init__. + All derived filters must provide a process_field(self, item) that + returns a modified item. process_field is called on one or more keys + passed into __init__. """ def __init__(self, keys): - super(FieldFilter, self).__init__() + super().__init__() self._target_keys = utils.str_or_list(keys) def process_record(self, record): - """ Calls process_field on all keys passed to __init__. """ + """Calls process_field on all keys passed to __init__.""" for key in self._target_keys: try: @@ -89,29 +91,31 @@ class FieldFilter(Filter): return record def process_field(self, item): - """ Given a value, return the value that it should be replaced with. """ + """Given a value, return the value that it should be replaced with.""" - raise NotImplementedError('process_field not defined in ' + - self.__class__.__name__) + raise NotImplementedError( + "process_field not defined in " + self.__class__.__name__ + ) def __unicode__(self): - return '%s( %s )' % (self.__class__.__name__, str(self._target_keys)) + return "%s( %s )" % (self.__class__.__name__, str(self._target_keys)) + class ConditionalFilter(YieldFilter): - """ ABC for filters that only pass through records meeting a condition. + """ABC for filters that only pass through records meeting a condition. - All derived filters must provide a test_record(self, record) that - returns True or False -- True indicating that the record should be - passed through, and False preventing pass through. + All derived filters must provide a test_record(self, record) that + returns True or False -- True indicating that the record should be + passed through, and False preventing pass through. - If validator is True then raises a ValidationError instead of - silently dropping records that fail test_record. + If validator is True then raises a ValidationError instead of + silently dropping records that fail test_record. 
""" validator = False def process_record(self, record): - """ Yields all records for which self.test_record is true """ + """Yields all records for which self.test_record is true""" if self.test_record(record): yield record @@ -119,41 +123,45 @@ class ConditionalFilter(YieldFilter): raise ValidationError(record) def test_record(self, record): - """ Given a record, return True iff it should be passed on """ - raise NotImplementedError('test_record not defined in ' + - self.__class__.__name__) + """Given a record, return True iff it should be passed on""" + raise NotImplementedError( + "test_record not defined in " + self.__class__.__name__ + ) + class ValidationError(Exception): def __init__(self, record): - super(ValidationError, self).__init__(repr(record)) + super().__init__(repr(record)) self.record = record + def _dotted_get(d, path): """ - utility function for SubrecordFilter + utility function for SubrecordFilter - dives into a complex nested dictionary with paths like a.b.c + dives into a complex nested dictionary with paths like a.b.c """ if path: - key_pieces = path.split('.', 1) + key_pieces = path.split(".", 1) piece = d[key_pieces[0]] if isinstance(piece, (tuple, list)): - return [_dotted_get(i, '.'.join(key_pieces[1:])) for i in piece] + return [_dotted_get(i, ".".join(key_pieces[1:])) for i in piece] elif isinstance(piece, (dict)): - return _dotted_get(piece, '.'.join(key_pieces[1:])) + return _dotted_get(piece, ".".join(key_pieces[1:])) else: return d -class SubrecordFilter(Filter): - """ Filter that calls another filter on subrecord(s) of a record - Takes a dotted path (eg. a.b.c) and instantiated filter and runs that - filter on all subrecords found at the path. +class SubrecordFilter(Filter): + """Filter that calls another filter on subrecord(s) of a record + + Takes a dotted path (eg. a.b.c) and instantiated filter and runs that + filter on all subrecords found at the path. """ def __init__(self, field_path, filter_): - if '.' in field_path: - self.field_path, self.key = field_path.rsplit('.', 1) + if "." in field_path: + self.field_path, self.key = field_path.rsplit(".", 1) else: self.field_path = None self.key = field_path @@ -178,8 +186,9 @@ class SubrecordFilter(Filter): self.process_subrecord(subrecord_parent) return record + class ConditionalPathFilter(Filter): - """ Filter that uses a predicate to split input among two filter paths. """ + """Filter that uses a predicate to split input among two filter paths.""" def __init__(self, predicate_func, true_filter, false_filter): self.predicate_func = predicate_func @@ -192,38 +201,43 @@ class ConditionalPathFilter(Filter): else: return self.false_filter.process_record(record) + ##################### -## Generic Filters ## +# Generic Filters # ##################### + class FieldModifier(FieldFilter): - """ Filter that calls a given function on a given set of fields. + """Filter that calls a given function on a given set of fields. - FieldModifier(('spam','eggs'), abs) to call the abs method on the spam - and eggs fields in each record filtered. + FieldModifier(('spam','eggs'), abs) to call the abs method on the spam + and eggs fields in each record filtered. 
""" def __init__(self, keys, func): - super(FieldModifier, self).__init__(keys) + super().__init__(keys) self._filter_func = func def process_field(self, item): return self._filter_func(item) - def __unicode__(self): - return '%s( %s, %s )' % (self.__class__.__name__, - str(self._target_keys), str(self._filter_func)) + def __str__(self): + return "%s( %s, %s )" % ( + self.__class__.__name__, + str(self._target_keys), + str(self._filter_func), + ) class FieldKeeper(Filter): - """ Filter that removes all but the given set of fields. + """Filter that removes all but the given set of fields. - FieldKeeper(('spam', 'eggs')) removes all bu tthe spam and eggs - fields from every record filtered. + FieldKeeper(('spam', 'eggs')) removes all bu tthe spam and eggs + fields from every record filtered. """ def __init__(self, keys): - super(FieldKeeper, self).__init__() + super().__init__() self._target_keys = utils.str_or_list(keys) def process_record(self, record): @@ -234,14 +248,14 @@ class FieldKeeper(Filter): class FieldRemover(Filter): - """ Filter that removes a given set of fields. + """Filter that removes a given set of fields. - FieldRemover(('spam', 'eggs')) removes the spam and eggs fields from - every record filtered. + FieldRemover(('spam', 'eggs')) removes the spam and eggs fields from + every record filtered. """ def __init__(self, keys): - super(FieldRemover, self).__init__() + super().__init__() self._target_keys = utils.str_or_list(keys) def process_record(self, record): @@ -249,21 +263,21 @@ class FieldRemover(Filter): record.pop(key, None) return record - def __unicode__(self): - return '%s( %s )' % (self.__class__.__name__, str(self._target_keys)) + def __str__(self): + return "%s( %s )" % (self.__class__.__name__, str(self._target_keys)) class FieldMerger(Filter): - """ Filter that merges a given set of fields using a supplied merge_func. + """Filter that merges a given set of fields using a supplied merge_func. - Takes a mapping (dictionary of new_column:(from_col1,from_col2)) + Takes a mapping (dictionary of new_column:(from_col1,from_col2)) - FieldMerger({"bacon": ("spam", "eggs")}, operator.add) creates a new - column bacon that is the result of spam+eggs + FieldMerger({"bacon": ("spam", "eggs")}, operator.add) creates a new + column bacon that is the result of spam+eggs """ def __init__(self, mapping, merge_func, keep_fields=False): - super(FieldMerger, self).__init__() + super().__init__() self._field_mapping = mapping self._merge_func = merge_func self._keep_fields = keep_fields @@ -277,30 +291,32 @@ class FieldMerger(Filter): record[to_col] = self._merge_func(*values) return record - def __unicode__(self): - return '%s( %s, %s )' % (self.__class__.__name__, - str(self._field_mapping), - str(self._merge_func)) + def __str__(self): + return "%s( %s, %s )" % ( + self.__class__.__name__, + str(self._field_mapping), + str(self._merge_func), + ) class FieldAdder(Filter): - """ Filter that adds a new field. + """Filter that adds a new field. - Takes a name for the new field and a value, field_value can be an - iterable, a function, or a static value. + Takes a name for the new field and a value, field_value can be an + iterable, a function, or a static value. - from itertools import count - FieldAdder('id', count) + from itertools import count + FieldAdder('id', count) - would yield a new column named id that uses the itertools count iterable - to create sequentially numbered ids. 
+    would yield a new column named id that uses the itertools count iterable
+    to create sequentially numbered ids.
     """
 
     def __init__(self, field_name, field_value, replace=True):
-        super(FieldAdder, self).__init__()
+        super().__init__()
         self._field_name = field_name
         self._field_value = field_value
-        if hasattr(self._field_value, '__iter__'):
+        if hasattr(self._field_value, "__iter__"):
             value_iter = iter(self._field_value)
             if hasattr(value_iter, "next"):
                 self._field_value = value_iter.next
@@ -317,17 +333,22 @@ class FieldAdder(Filter):
         return record
 
     def __unicode__(self):
-        return '%s( %s, %s )' % (self.__class__.__name__, self._field_name,
-                                 str(self._field_value))
+        return "%s( %s, %s )" % (
+            self.__class__.__name__,
+            self._field_name,
+            str(self._field_value),
+        )
+
 
 class FieldCopier(Filter):
-    """ Filter that copies one field to another.
+    """Filter that copies one field to another.
 
-        Takes a dictionary mapping destination keys to source keys.
+    Takes a dictionary mapping destination keys to source keys.
     """
+
     def __init__(self, copy_mapping):
-        super(FieldCopier, self).__init__()
+        super().__init__()
         self._copy_mapping = copy_mapping
 
     def process_record(self, record):
@@ -336,13 +357,15 @@ class FieldCopier(Filter):
             record[dest] = record[source]
         return record
 
-class FieldRenamer(Filter):
-    """ Filter that renames one field to another.
-        Takes a dictionary mapping destination keys to source keys.
 
+class FieldRenamer(Filter):
+    """Filter that renames one field to another.
+
+    Takes a dictionary mapping destination keys to source keys.
     """
+
     def __init__(self, rename_mapping):
-        super(FieldRenamer, self).__init__()
+        super().__init__()
         self._rename_mapping = rename_mapping
 
     def process_record(self, record):
@@ -351,15 +374,16 @@ class FieldRenamer(Filter):
             record[dest] = record.pop(source)
         return record
 
-class FieldNameModifier(Filter):
-    """ Filter that calls a given function on a given set of fields.
-        FieldNameModifier(('spam','eggs'), abs) to call the abs method on the spam
-        and eggs field names in each record filtered.
 
+class FieldNameModifier(Filter):
+    """Filter that calls a given function on each field name.
+
+    FieldNameModifier(str.lower) to call str.lower on every field name
+    in each record filtered.
     """
 
     def __init__(self, func):
-        super(FieldNameModifier, self).__init__()
+        super().__init__()
         self._filter_func = func
 
     def process_record(self, record):
@@ -368,19 +392,20 @@ class FieldNameModifier(Filter):
             record[dest] = record.pop(source)
         return record
 
+
 class Splitter(Filter):
-    """ Filter that splits nested data into different paths.
+    """Filter that splits nested data into different paths.
 
-        Takes a dictionary of keys and a series of filters to run against the
-        associated dictionaries.
+    Takes a dictionary of keys and a series of filters to run against the
+    associated dictionaries.
 
-        {'person': {'firstname': 'James', 'lastname': 'Turk'},
-         'phones': [{'phone': '222-222-2222'}, {'phone': '335-333-3321'}]
-        }
+    {'person': {'firstname': 'James', 'lastname': 'Turk'},
+     'phones': [{'phone': '222-222-2222'}, {'phone': '335-333-3321'}]
+    }
     """
 
     def __init__(self, split_mapping):
-        super(Splitter, self).__init__()
+        super().__init__()
         self._split_mapping = split_mapping
 
     def process_record(self, record):
@@ -409,21 +434,22 @@ class Splitter(Filter):
 
 class Flattener(FieldFilter):
-    """ Collapse a set of similar dictionaries into a list.
+    """Collapse a set of similar dictionaries into a list.
-        Takes a dictionary of keys and flattens the key names:
+    Takes a dictionary of keys and flattens the key names:
 
-        addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
-                     {'address': {'state':'NY', 'street':'3000 Winton Rd'}}]}]
-        flattener = Flattener(['addresses'])
+    addresses = [{'addresses': [{'address': {'state':'NC', 'street':'146 shirley drive'}},
+                 {'address': {'state':'NY', 'street':'3000 Winton Rd'}}]}]
+    flattener = Flattener(['addresses'])
 
-        would yield:
+    would yield:
 
-        {'addresses': [{'state': 'NC', 'street': '146 shirley drive'},
-                       {'state': 'NY', 'street': '3000 Winton Rd'}]}
+    {'addresses': [{'state': 'NC', 'street': '146 shirley drive'},
+                   {'state': 'NY', 'street': '3000 Winton Rd'}]}
     """
+
     def __init__(self, keys):
-        super(Flattener, self).__init__(keys)
+        super().__init__(keys)
 
     def process_field(self, item):
         result = []
@@ -436,8 +462,8 @@ class Flattener(FieldFilter):
 
 class DictFlattener(Filter):
-    def __init__(self, keys, separator='_'):
-        super(DictFlattener, self).__init__()
+    def __init__(self, keys, separator="_"):
+        super().__init__()
         self._keys = utils.str_or_list(keys)
         self._separator = separator
@@ -446,11 +472,10 @@ class DictFlattener(Filter):
 
 class Unique(ConditionalFilter):
-    """ Filter that ensures that all records passing through are unique.
-    """
+    """Filter that ensures that all records passing through are unique."""
 
     def __init__(self):
-        super(Unique, self).__init__()
+        super().__init__()
         self._seen = set()
 
     def test_record(self, record):
@@ -461,19 +486,20 @@ class Unique(ConditionalFilter):
         else:
             return False
 
+
 class UniqueValidator(Unique):
     validator = True
 
 
 class UniqueID(ConditionalFilter):
-    """ Filter that ensures that all records through have a unique ID.
+    """Filter that ensures that all records passing through have a unique ID.
 
-        Takes the name of an ID field, or multiple field names in the case
-        of a composite ID.
+    Takes the name of an ID field, or multiple field names in the case
+    of a composite ID.
     """
 
-    def __init__(self, field='id', *args):
-        super(UniqueID, self).__init__()
+    def __init__(self, field="id", *args):
+        super().__init__()
         self._seen = set()
         self._id_fields = [field]
         self._id_fields.extend(args)
@@ -486,58 +512,30 @@ class UniqueID(ConditionalFilter):
         else:
             return False
 
+
 class UniqueIDValidator(UniqueID):
     validator = True
 
 
-class UnicodeFilter(Filter):
-    """ Convert all str elements in the record to Unicode.
-    """
-
-    def __init__(self, encoding='utf-8', errors='ignore'):
-        super(UnicodeFilter, self).__init__()
-        self._encoding = encoding
-        self._errors = errors
-
-    def process_record(self, record):
-        for key, value in record.items():
-            if isinstance(value, str):
-                record[key] = unicode(value, self._encoding, self._errors)
-            elif isinstance(value, unicode):
-                record[key] = value.decode(self._encoding, self._errors)
-        return record
-
-class StringFilter(Filter):
-
-    def __init__(self, encoding='utf-8', errors='ignore'):
-        super(StringFilter, self).__init__()
-        self._encoding = encoding
-        self._errors = errors
-
-    def process_record(self, record):
-        for key, value in record.items():
-            if isinstance(value, unicode):
-                record[key] = value.encode(self._encoding, self._errors)
-        return record
-
-
 ###########################
-## Commonly Used Filters ##
+# Commonly Used Filters #
 ###########################
 
+
 class PhoneNumberCleaner(FieldFilter):
-    """ Filter that cleans phone numbers to match a given format.
+    """Filter that cleans phone numbers to match a given format.
-        Takes a list of target keys and an optional phone # format that has
-        10 %s placeholders.
+    Takes a list of target keys and an optional phone # format that has
+    10 %s placeholders.
 
-        PhoneNumberCleaner( ('phone','fax'), number_format='%s%s%s-%s%s%s-%s%s%s%s')
-        would format the phone & fax columns to 555-123-4567 format.
+    PhoneNumberCleaner( ('phone','fax'), number_format='%s%s%s-%s%s%s-%s%s%s%s')
+    would format the phone & fax columns to 555-123-4567 format.
     """
 
-    def __init__(self, keys, number_format='%s%s%s.%s%s%s.%s%s%s%s'):
-        super(PhoneNumberCleaner, self).__init__(keys)
+    def __init__(self, keys, number_format="%s%s%s.%s%s%s.%s%s%s%s"):
+        super().__init__(keys)
        self._number_format = number_format
-        self._num_re = re.compile('\d')
+        self._num_re = re.compile(r"\d")
 
     def process_field(self, item):
         nums = self._num_re.findall(item)
@@ -545,46 +543,54 @@ class PhoneNumberCleaner(FieldFilter):
             item = self._number_format % tuple(nums)
         return item
 
-class DateCleaner(FieldFilter):
-    """ Filter that cleans dates to match a given format.
-        Takes a list of target keys and to and from formats in strftime format.
 
+class DateCleaner(FieldFilter):
+    """Filter that cleans dates to match a given format.
+
+    Takes a list of target keys and to and from formats in strftime format.
     """
+
     def __init__(self, keys, from_format, to_format):
-        super(DateCleaner, self).__init__(keys)
+        super().__init__(keys)
         self._from_format = from_format
         self._to_format = to_format
 
     def process_field(self, item):
-        return time.strftime(self._to_format,
-                             time.strptime(item, self._from_format))
+        return time.strftime(self._to_format, time.strptime(item, self._from_format))
 
+
 class NameCleaner(Filter):
-    """ Filter that splits names into a first, last, and middle name field.
+    """Filter that splits names into a first, last, and middle name field.
 
-        Takes a list of target keys.
+    Takes a list of target keys.
 
-        NameCleaner( ('name', ), nomatch_name='raw_name')
-        would attempt to split 'name' into firstname, middlename, lastname,
-        and suffix columns, and if it did not fit would place it in raw_name
+    NameCleaner( ('name', ), nomatch_name='raw_name')
+    would attempt to split 'name' into firstname, middlename, lastname,
+    and suffix columns, and if it did not fit would place it in raw_name
     """
 
     # first middle? last suffix?
-    FIRST_LAST = re.compile('''^\s*(?:(?P<firstname>\w+)(?:\.?)
+    FIRST_LAST = re.compile(
+        r"""^\s*(?:(?P<firstname>\w+)(?:\.?)
                    \s+(?:(?P<middlename>\w+)\.?\s+)?
                    (?P<lastname>[A-Za-z'-]+))
                    (?:\s+(?P<suffix>JR\.?|II|III|IV))?
-                   \s*$''', re.VERBOSE | re.IGNORECASE)
+                   \s*$""",
+        re.VERBOSE | re.IGNORECASE,
+    )
 
     # last, first middle? suffix?
-    LAST_FIRST = re.compile('''^\s*(?:(?P<lastname>[A-Za-z'-]+),
+    LAST_FIRST = re.compile(
+        r"""^\s*(?:(?P<lastname>[A-Za-z'-]+),
                    \s+(?P<firstname>\w+)(?:\.?)
                    (?:\s+(?P<middlename>\w+)\.?)?)
                    (?:\s+(?P<suffix>JR\.?|II|III|IV))?
-                   \s*$''', re.VERBOSE | re.IGNORECASE)
+                   \s*$""",
+        re.VERBOSE | re.IGNORECASE,
+    )
 
-    def __init__(self, keys, prefix='', formats=None, nomatch_name=None):
-        super(NameCleaner, self).__init__()
+    def __init__(self, keys, prefix="", formats=None, nomatch_name=None):
+        super().__init__()
         self._keys = utils.str_or_list(keys)
         self._name_prefix = prefix
         self._nomatch_name = nomatch_name
@@ -605,7 +611,7 @@ class NameCleaner(Filter):
             # if there is a match, remove original name and add pieces
             if match:
                 record.pop(key)
-                for k,v in match.groupdict().items():
+                for k, v in match.groupdict().items():
                     record[self._name_prefix + k] = v
                 break
diff --git a/saucebrush/sources.py b/src/saucebrush/sources.py
similarity index 50%
rename from saucebrush/sources.py
rename to src/saucebrush/sources.py
index c265754..a44968f 100644
--- a/saucebrush/sources.py
+++ b/src/saucebrush/sources.py
@@ -4,27 +4,28 @@ All sources must implement the iterable interface and return
 python dictionaries.
 """
-from __future__ import unicode_literals
 
 import string
 
 from saucebrush import utils
 
 
-class CSVSource(object):
-    """ Saucebrush source for reading from CSV files.
-        Takes an open csvfile, an optional set of fieldnames and optional number
-        of rows to skip.
 
-        CSVSource(open('test.csv')) will read a csvfile, using the first row as
-        the field names.
+class CSVSource:
+    """Saucebrush source for reading from CSV files.
 
-        CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read
-        in a CSV file and treat the three columns as name, phone, and address,
-        ignoring the first row (presumed to be column names).
+    Takes an open csvfile, an optional set of fieldnames and optional number
+    of rows to skip.
+
+    CSVSource(open('test.csv')) will read a csvfile, using the first row as
+    the field names.
+
+    CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read
+    in a CSV file and treat the three columns as name, phone, and address,
+    ignoring the first row (presumed to be column names).
     """
 
     def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):
         import csv
+
         self._dictreader = csv.DictReader(csvfile, fieldnames, **kwargs)
         for _ in range(skiprows):
             next(self._dictreader)
@@ -33,17 +34,17 @@ class CSVSource(object):
         return self._dictreader
 
 
-class FixedWidthFileSource(object):
-    """ Saucebrush source for reading from fixed width field files.
+class FixedWidthFileSource:
+    """Saucebrush source for reading from fixed width field files.
 
-        FixedWidthFileSource expects an open fixed width file and a tuple
-        of fields with their lengths. There is also an optional fillchars
-        command that is the filler characters to strip from the end of each
-        field. (defaults to whitespace)
+    FixedWidthFileSource expects an open fixed width file and a tuple
+    of fields with their lengths. There is also an optional fillchars
+    argument giving the filler characters to strip from the end of each
+    field. (defaults to whitespace)
 
-        FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
-        will read in a fixed width file where the first 30 characters of each
-        line are part of a name and the characters 31-42 are a phone number.
+    FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
+    will read in a fixed width file where the first 30 characters of each
+    line are part of a name and the characters 31-42 are a phone number.
""" def __init__(self, fwfile, fields, fillchars=string.whitespace): @@ -64,97 +65,98 @@ class FixedWidthFileSource(object): line = next(self._fwfile) record = {} for name, range_ in self._fields_dict.items(): - record[name] = line[range_[0]:range_[1]].rstrip(self._fillchars) + record[name] = line[range_[0] : range_[1]].rstrip(self._fillchars) return record - def next(self): - """ Keep Python 2 next() method that defers to __next__(). - """ - return self.__next__() +class HtmlTableSource: + """Saucebrush source for reading data from an HTML table. -class HtmlTableSource(object): - """ Saucebrush source for reading data from an HTML table. + HtmlTableSource expects an open html file, the id of the table or a + number indicating which table on the page to use, an optional fieldnames + tuple, and an optional number of rows to skip. - HtmlTableSource expects an open html file, the id of the table or a - number indicating which table on the page to use, an optional fieldnames - tuple, and an optional number of rows to skip. + HtmlTableSource(open('test.html'), 0) opens the first HTML table and + uses the first row as the names of the columns. - HtmlTableSource(open('test.html'), 0) opens the first HTML table and - uses the first row as the names of the columns. - - HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens - the HTML table with an id of 'people' and names the two columns - name and phone, skipping the first row where alternate names are - stored. + HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens + the HTML table with an id of 'people' and names the two columns + name and phone, skipping the first row where alternate names are + stored. """ def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0): # extract the table from lxml.html import parse + doc = parse(htmlfile).getroot() if isinstance(id_or_num, int): - table = doc.cssselect('table')[id_or_num] + table = doc.cssselect("table")[id_or_num] else: - table = doc.cssselect('table#%s' % id_or_num) + table = doc.cssselect("table#%s" % id_or_num) - table = table[0] # get the first table + table = table[0] # get the first table # skip the necessary number of rows - self._rows = table.cssselect('tr')[skiprows:] + self._rows = table.cssselect("tr")[skiprows:] # determine the fieldnames if not fieldnames: - self._fieldnames = [td.text_content() - for td in self._rows[0].cssselect('td, th')] + self._fieldnames = [ + td.text_content() for td in self._rows[0].cssselect("td, th") + ] skiprows += 1 else: self._fieldnames = fieldnames # skip the necessary number of rows - self._rows = table.cssselect('tr')[skiprows:] + self._rows = table.cssselect("tr")[skiprows:] def process_tr(self): for row in self._rows: - strings = [td.text_content() for td in row.cssselect('td')] + strings = [td.text_content() for td in row.cssselect("td")] yield dict(zip(self._fieldnames, strings)) def __iter__(self): return self.process_tr() -class DjangoModelSource(object): - """ Saucebrush source for reading data from django models. +class DjangoModelSource: + """Saucebrush source for reading data from django models. - DjangoModelSource expects a django settings file, app label, and model - name. The resulting records contain all columns in the table for the - specified model. + DjangoModelSource expects a django settings file, app label, and model + name. The resulting records contain all columns in the table for the + specified model. 
-        DjangoModelSource('settings.py', 'phonebook', 'friend') would read all
-        friends from the friend model in the phonebook app described in
-        settings.py.
+    DjangoModelSource('settings.py', 'phonebook', 'friend') would read all
+    friends from the friend model in the phonebook app described in
+    settings.py.
     """
+
     def __init__(self, dj_settings, app_label, model_name):
         dbmodel = utils.get_django_model(dj_settings, app_label, model_name)
 
         # only get values defined in model (no extra fields from custom manager)
-        self._data = dbmodel.objects.values(*[f.name
-                                              for f in dbmodel._meta.fields])
+        self._data = dbmodel.objects.values(*[f.name for f in dbmodel._meta.fields])
 
     def __iter__(self):
         return iter(self._data)
 
 
-class MongoDBSource(object):
-    """ Source for reading from a MongoDB database.
+class MongoDBSource:
+    """Source for reading from a MongoDB database.
 
-        The record dict is populated with records matching the spec
-        from the specified database and collection.
+    The record dict is populated with records matching the spec
+    from the specified database and collection.
     """
 
-    def __init__(self, database, collection, spec=None, host='localhost', port=27017, conn=None):
+    def __init__(
+        self, database, collection, spec=None, host="localhost", port=27017, conn=None
+    ):
         if not conn:
             from pymongo.connection import Connection
+
             conn = Connection(host, port)
         self.collection = conn[database][collection]
         self.spec = spec
@@ -166,19 +168,21 @@ class MongoDBSource(object):
         for doc in self.collection.find(self.spec):
             yield dict(doc)
 
+
 # dict_factory for sqlite source
 def dict_factory(cursor, row):
-    d = { }
+    d = {}
     for idx, col in enumerate(cursor.description):
         d[col[0]] = row[idx]
     return d
 
 
-class SqliteSource(object):
-    """ Source that reads from a sqlite database.
-        The record dict is populated with the results from the
-        query argument. If given, args will be passed to the query
-        when executed.
 
+class SqliteSource:
+    """Source that reads from a sqlite database.
+
+    The record dict is populated with the results from the
+    query argument. If given, args will be passed to the query
+    when executed.
     """
 
     def __init__(self, dbpath, query, args=None, conn_params=None):
@@ -213,11 +217,11 @@ class SqliteSource(object):
         self._conn.close()
 
 
-class FileSource(object):
-    """ Base class for sources which read from one or more files.
+class FileSource:
+    """Base class for sources which read from one or more files.
 
-        Takes as input a file-like, a file path, a list of file-likes,
-        or a list of file paths.
+    Takes as input a file-like, a file path, a list of file-likes,
+    or a list of file paths.
""" def __init__(self, input): @@ -226,34 +230,36 @@ class FileSource(object): def __iter__(self): # This method would be a lot cleaner with the proposed # 'yield from' expression (PEP 380) - if hasattr(self._input, '__read__') or hasattr(self._input, 'read'): + if hasattr(self._input, "__read__") or hasattr(self._input, "read"): for record in self._process_file(self._input): yield record elif isinstance(self._input, str): with open(self._input) as f: for record in self._process_file(f): yield record - elif hasattr(self._input, '__iter__'): + elif hasattr(self._input, "__iter__"): for el in self._input: if isinstance(el, str): with open(el) as f: for record in self._process_file(f): yield record - elif hasattr(el, '__read__') or hasattr(el, 'read'): + elif hasattr(el, "__read__") or hasattr(el, "read"): for record in self._process_file(f): yield record def _process_file(self, file): - raise NotImplementedError('Descendants of FileSource should implement' - ' a custom _process_file method.') + raise NotImplementedError( + "Descendants of FileSource should implement" + " a custom _process_file method." + ) class JSONSource(FileSource): - """ Source for reading from JSON files. + """Source for reading from JSON files. - When processing JSON files, if the top-level object is a list, will - yield each member separately. Otherwise, yields the top-level - object. + When processing JSON files, if the top-level object is a list, will + yield each member separately. Otherwise, yields the top-level + object. """ def _process_file(self, f): @@ -271,36 +277,37 @@ class JSONSource(FileSource): else: yield obj -class XMLSource(FileSource): - """ Source for reading from XML files. Use with the same kind of caution - that you use to approach anything written in XML. - When processing XML files, if the top-level object is a list, will - yield each member separately, unless the dotted path to a list is - included. you can also do this with a SubrecordFilter, but XML is - almost never going to be useful at the top level. +class XMLSource(FileSource): + """Source for reading from XML files. Use with the same kind of caution + that you use to approach anything written in XML. + + When processing XML files, if the top-level object is a list, will + yield each member separately, unless the dotted path to a list is + included. you can also do this with a SubrecordFilter, but XML is + almost never going to be useful at the top level. """ - def __init__(self, input, node_path=None, attr_prefix='ATTR_', - postprocessor=None): - super(XMLSource, self).__init__(input) - self.node_list = node_path.split('.') + def __init__(self, input, node_path=None, attr_prefix="ATTR_", postprocessor=None): + super().__init__(input) + self.node_list = node_path.split(".") self.attr_prefix = attr_prefix self.postprocessor = postprocessor - def _process_file(self, f, attr_prefix='ATTR_'): + def _process_file(self, f, attr_prefix="ATTR_"): """xmltodict can either return attributes of nodes as prefixed fields - (prefixes to avoid key collisions), or ignore them altogether. + (prefixes to avoid key collisions), or ignore them altogether. - set attr prefix to whatever you want. Setting it to False ignores - attributes. + set attr prefix to whatever you want. Setting it to False ignores + attributes. 
""" import xmltodict if self.postprocessor: - obj = xmltodict.parse(f, attr_prefix=self.attr_prefix, - postprocessor=self.postprocessor) + obj = xmltodict.parse( + f, attr_prefix=self.attr_prefix, postprocessor=self.postprocessor + ) else: obj = xmltodict.parse(f, attr_prefix=self.attr_prefix) @@ -308,7 +315,7 @@ class XMLSource(FileSource): if self.node_list: for node in self.node_list: - obj = obj[node] + obj = obj[node] # If the top-level XML object in the file is a list # then yield each element separately; otherwise, yield diff --git a/saucebrush/stats.py b/src/saucebrush/stats.py similarity index 58% rename from saucebrush/stats.py rename to src/saucebrush/stats.py index a827538..dc88c10 100644 --- a/saucebrush/stats.py +++ b/src/saucebrush/stats.py @@ -1,22 +1,22 @@ from saucebrush.filters import Filter -from saucebrush.utils import FallbackCounter import collections -import itertools import math -def _average(values): - """ Calculate the average of a list of values. - :param values: an iterable of ints or floats to average +def _average(values): + """Calculate the average of a list of values. + + :param values: an iterable of ints or floats to average """ value_count = len(values) if len(values) > 0: return sum(values) / float(value_count) -def _median(values): - """ Calculate the median of a list of values. - :param values: an iterable of ints or floats to calculate +def _median(values): + """Calculate the median of a list of values. + + :param values: an iterable of ints or floats to calculate """ count = len(values) @@ -35,14 +35,15 @@ def _median(values): else: # even number of items, return average of middle two items mid = int(count / 2) - return sum(values[mid - 1:mid + 1]) / 2.0 + return sum(values[mid - 1 : mid + 1]) / 2.0 + def _stddev(values, population=False): - """ Calculate the standard deviation and variance of a list of values. + """Calculate the standard deviation and variance of a list of values. - :param values: an iterable of ints or floats to calculate - :param population: True if values represents entire population, - False if it is a sample of the population + :param values: an iterable of ints or floats to calculate + :param population: True if values represents entire population, + False if it is a sample of the population """ avg = _average(values) @@ -54,11 +55,11 @@ def _stddev(values, population=False): # the average of the squared differences variance = sum(diffsq) / float(count) - return (math.sqrt(variance), variance) # stddev is sqrt of variance + return (math.sqrt(variance), variance) # stddev is sqrt of variance + class StatsFilter(Filter): - """ Base for all stats filters. - """ + """Base for all stats filters.""" def __init__(self, field, test=None): self._field = field @@ -70,20 +71,21 @@ class StatsFilter(Filter): return record def process_field(self, record): - raise NotImplementedError('process_field not defined in ' + - self.__class__.__name__) + raise NotImplementedError( + "process_field not defined in " + self.__class__.__name__ + ) def value(self): - raise NotImplementedError('value not defined in ' + - self.__class__.__name__) + raise NotImplementedError("value not defined in " + self.__class__.__name__) + class Sum(StatsFilter): - """ Calculate the sum of the values in a field. Field must contain either - int or float values. + """Calculate the sum of the values in a field. Field must contain either + int or float values. 
""" def __init__(self, field, initial=0, **kwargs): - super(Sum, self).__init__(field, **kwargs) + super().__init__(field, **kwargs) self._value = initial def process_field(self, item): @@ -92,13 +94,14 @@ class Sum(StatsFilter): def value(self): return self._value + class Average(StatsFilter): - """ Calculate the average (mean) of the values in a field. Field must - contain either int or float values. + """Calculate the average (mean) of the values in a field. Field must + contain either int or float values. """ def __init__(self, field, initial=0, **kwargs): - super(Average, self).__init__(field, **kwargs) + super().__init__(field, **kwargs) self._value = initial self._count = 0 @@ -110,15 +113,16 @@ class Average(StatsFilter): def value(self): return self._value / float(self._count) -class Median(StatsFilter): - """ Calculate the median of the values in a field. Field must contain - either int or float values. - **This filter keeps a list of field values in memory.** +class Median(StatsFilter): + """Calculate the median of the values in a field. Field must contain + either int or float values. + + **This filter keeps a list of field values in memory.** """ def __init__(self, field, **kwargs): - super(Median, self).__init__(field, **kwargs) + super().__init__(field, **kwargs) self._values = [] def process_field(self, item): @@ -128,13 +132,14 @@ class Median(StatsFilter): def value(self): return _median(self._values) + class MinMax(StatsFilter): - """ Find the minimum and maximum values in a field. Field must contain - either int or float values. + """Find the minimum and maximum values in a field. Field must contain + either int or float values. """ def __init__(self, field, **kwargs): - super(MinMax, self).__init__(field, **kwargs) + super().__init__(field, **kwargs) self._max = None self._min = None @@ -148,18 +153,19 @@ class MinMax(StatsFilter): def value(self): return (self._min, self._max) -class StandardDeviation(StatsFilter): - """ Calculate the standard deviation of the values in a field. Calling - value() will return a standard deviation for the sample. Pass - population=True to value() for the standard deviation of the - population. Convenience methods are provided for average() and - median(). Field must contain either int or float values. - **This filter keeps a list of field values in memory.** +class StandardDeviation(StatsFilter): + """Calculate the standard deviation of the values in a field. Calling + value() will return a standard deviation for the sample. Pass + population=True to value() for the standard deviation of the + population. Convenience methods are provided for average() and + median(). Field must contain either int or float values. + + **This filter keeps a list of field values in memory.** """ def __init__(self, field, **kwargs): - super(StandardDeviation, self).__init__(field, **kwargs) + super().__init__(field, **kwargs) self._values = [] def process_field(self, item): @@ -173,31 +179,29 @@ class StandardDeviation(StatsFilter): return _median(self._values) def value(self, population=False): - """ Return a tuple of (standard_deviation, variance). + """Return a tuple of (standard_deviation, variance). - :param population: True if values represents entire population, - False if values is a sample. Default: False + :param population: True if values represents entire population, + False if values is a sample. Default: False """ return _stddev(self._values, population) -class Histogram(StatsFilter): - """ Generate a basic histogram of the specified field. 
-        returns a dict of value to occurance count mappings. The __str__ method
-        generates a basic and limited histogram useful for printing to the
-        command line. The label_length attribute determines the padding and
-        cut-off of the basic histogram labels.
-        **This filters maintains a dict of unique field values in memory.**
 
+class Histogram(StatsFilter):
+    """Generate a basic histogram of the specified field. The value() method
+    returns a dict of value to occurrence count mappings. The __str__ method
+    generates a basic and limited histogram useful for printing to the
+    command line. The label_length attribute determines the padding and
+    cut-off of the basic histogram labels.
+
+    **This filter maintains a dict of unique field values in memory.**
     """
 
     label_length = 6
 
     def __init__(self, field, **kwargs):
-        super(Histogram, self).__init__(field, **kwargs)
-        if hasattr(collections, 'Counter'):
-            self._counter = collections.Counter()
-        else:
-            self._counter = FallbackCounter()
+        super().__init__(field, **kwargs)
+        self._counter = collections.Counter()
 
     def process_field(self, item):
         self._counter[self.prep_field(item)] += 1
diff --git a/saucebrush/utils.py b/src/saucebrush/utils.py
similarity index 51%
rename from saucebrush/utils.py
rename to src/saucebrush/utils.py
index f94e15d..71ab392 100644
--- a/saucebrush/utils.py
+++ b/src/saucebrush/utils.py
@@ -1,45 +1,46 @@
-import collections
 import os
-
-try:
-    from urllib.request import urlopen  # attemp py3 first
-except ImportError:
-    from urllib2 import urlopen  # fallback to py2
+from urllib.request import urlopen
 
 """
 General utilities used within saucebrush that may be useful elsewhere.
 """
 
+
 def get_django_model(dj_settings, app_label, model_name):
     """
-        Get a django model given a settings file, app label, and model name.
+    Get a django model given a settings file, app label, and model name.
     """
     from django.conf import settings
+
     if not settings.configured:
-        settings.configure(DATABASE_ENGINE=dj_settings.DATABASE_ENGINE,
-                           DATABASE_NAME=dj_settings.DATABASE_NAME,
-                           DATABASE_USER=dj_settings.DATABASE_USER,
-                           DATABASE_PASSWORD=dj_settings.DATABASE_PASSWORD,
-                           DATABASE_HOST=dj_settings.DATABASE_HOST,
-                           INSTALLED_APPS=dj_settings.INSTALLED_APPS)
+        settings.configure(
+            DATABASE_ENGINE=dj_settings.DATABASE_ENGINE,
+            DATABASE_NAME=dj_settings.DATABASE_NAME,
+            DATABASE_USER=dj_settings.DATABASE_USER,
+            DATABASE_PASSWORD=dj_settings.DATABASE_PASSWORD,
+            DATABASE_HOST=dj_settings.DATABASE_HOST,
+            INSTALLED_APPS=dj_settings.INSTALLED_APPS,
+        )
     from django.db.models import get_model
+
     return get_model(app_label, model_name)
 
-def flatten(item, prefix='', separator='_', keys=None):
-    """
-        Flatten nested dictionary into one with its keys concatenated together.
-        >>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
-                    'f':{'g':{'h':6}}})
-        {'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
 
+def flatten(item, prefix="", separator="_", keys=None):
+    """
+    Flatten nested dictionary into one with its keys concatenated together.
+
+    >>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
+                'f':{'g':{'h':6}}})
+    {'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
     """
 
     # update dictionaries recursively
     if isinstance(item, dict):
         # don't prepend a leading _
-        if prefix != '':
+        if prefix != "":
             prefix += separator
         retval = {}
         for key, value in item.items():
@@ -48,45 +49,30 @@ def flatten(item, prefix='', separator='_', keys=None):
             else:
                 retval[prefix + key] = value
         return retval
-    #elif isinstance(item, (tuple, list)):
+    # elif isinstance(item, (tuple, list)):
     #    return {prefix: [flatten(i, prefix, separator, keys) for i in item]}
     else:
         return {prefix: item}
 
+
 def str_or_list(obj):
     if isinstance(obj, str):
         return [obj]
     else:
         return obj
 
+
 #
 # utility classes
 #
 
-class FallbackCounter(collections.defaultdict):
-    """ Python 2.6 does not have collections.Counter.
-        This is class that does the basics of what we need from Counter.
-    """
 
-    def __init__(self, *args, **kwargs):
-        super(FallbackCounter, self).__init__(int)
+class Files:
+    """Iterate over multiple files as a single file. Pass the paths of the
+    files as arguments to the class constructor:
 
-    def most_common(n=None):
-
-        l = sorted(self.items(),
-                   cmp=lambda x,y: cmp(x[1], y[1]))
-
-        if n is not None:
-            l = l[:n]
-
-        return l
-
-class Files(object):
-    """ Iterate over multiple files as a single file. Pass the paths of the
-        files as arguments to the class constructor:
-
-        for line in Files('/path/to/file/a', '/path/to/file/b'):
-            pass
+    for line in Files('/path/to/file/a', '/path/to/file/b'):
+        pass
     """
 
     def __init__(self, *args):
@@ -111,10 +97,11 @@ class Files(object):
                 yield line
             f.close()
 
-class RemoteFile(object):
-    """ Stream data from a remote file.
-        :param url: URL to remote file
 
+class RemoteFile:
+    """Stream data from a remote file.
+
+    :param url: URL to remote file
     """
 
     def __init__(self, url):
@@ -126,21 +113,24 @@ class RemoteFile(object):
             yield line.rstrip()
         resp.close()
 
-class ZippedFiles(object):
-    """ unpack a zipped collection of files on init.
-        Takes a string with file location or zipfile.ZipFile object
 
-        Best to wrap this in a Files() object, if the goal is to have a
-        linereader, as this only returns filelike objects.
+class ZippedFiles:
+    """unpack a zipped collection of files on init.
 
-        if using a ZipFile object, make sure to set mode to 'a' or 'w' in order
-        to use the add() function.
+    Takes a string with file location or zipfile.ZipFile object
+
+    Best to wrap this in a Files() object, if the goal is to have a
+    linereader, as this only returns filelike objects.
+
+    if using a ZipFile object, make sure to set mode to 'a' or 'w' in order
+    to use the add() function.
""" + def __init__(self, zippedfile): import zipfile + if type(zippedfile) == str: - self._zipfile = zipfile.ZipFile(zippedfile,'a') + self._zipfile = zipfile.ZipFile(zippedfile, "a") else: self._zipfile = zippedfile self.paths = self._zipfile.namelist() @@ -152,10 +142,10 @@ class ZippedFiles(object): def add(self, path, dirname=None, arcname=None): path_base = os.path.basename(path) if dirname: - arcname = os.path.join(dirname,path_base) + arcname = os.path.join(dirname, path_base) if not arcname: arcname = path_base - self._zipfile.write(path,arcname) + self._zipfile.write(path, arcname) self.paths.append(path) def filereader(self): diff --git a/tests/test_emitters.py b/tests/test_emitters.py new file mode 100644 index 0000000..fead1e3 --- /dev/null +++ b/tests/test_emitters.py @@ -0,0 +1,107 @@ +from contextlib import closing +from io import StringIO +import os + +from saucebrush.emitters import ( + DebugEmitter, + CSVEmitter, + CountEmitter, + SqliteEmitter, + SqlDumpEmitter, +) + + +def test_debug_emitter(): + with closing(StringIO()) as output: + de = DebugEmitter(output) + list(de.attach([1, 2, 3])) + assert output.getvalue() == "1\n2\n3\n" + + +def test_count_emitter(): + + # values for test + values = [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 20, + 21, + 22, + ] + + with closing(StringIO()) as output: + + # test without of parameter + ce = CountEmitter(every=10, outfile=output, format="%(count)s records\n") + list(ce.attach(values)) + assert output.getvalue() == "10 records\n20 records\n" + ce.done() + assert output.getvalue() == "10 records\n20 records\n22 records\n" + + with closing(StringIO()) as output: + + # test with of parameter + ce = CountEmitter(every=10, outfile=output, of=len(values)) + list(ce.attach(values)) + assert output.getvalue() == "10 of 22\n20 of 22\n" + ce.done() + assert output.getvalue() == "10 of 22\n20 of 22\n22 of 22\n" + + +def test_csv_emitter(): + io = StringIO() # if Python 3.x then use StringIO + + with closing(io) as output: + ce = CSVEmitter(output, ("x", "y", "z")) + list(ce.attach([{"x": 1, "y": 2, "z": 3}, {"x": 5, "y": 5, "z": 5}])) + assert output.getvalue() == "x,y,z\r\n1,2,3\r\n5,5,5\r\n" + + +def test_sqlite_emitter(): + + import sqlite3 + import tempfile + + with closing(tempfile.NamedTemporaryFile(suffix=".db")) as f: + db_path = f.name + + sle = SqliteEmitter(db_path, "testtable", fieldnames=("a", "b", "c")) + list(sle.attach([{"a": "1", "b": "2", "c": "3"}])) + sle.done() + + with closing(sqlite3.connect(db_path)) as conn: + cur = conn.cursor() + cur.execute("""SELECT a, b, c FROM testtable""") + results = cur.fetchall() + + os.unlink(db_path) + + assert results == [("1", "2", "3")] + + +def test_sql_dump_emitter(): + + with closing(StringIO()) as bffr: + + sde = SqlDumpEmitter(bffr, "testtable", ("a", "b")) + list(sde.attach([{"a": 1, "b": "2"}])) + sde.done() + + assert bffr.getvalue() == "INSERT INTO `testtable` (`a`,`b`) VALUES (1,'2');\n" diff --git a/tests/test_filters.py b/tests/test_filters.py new file mode 100644 index 0000000..71025b4 --- /dev/null +++ b/tests/test_filters.py @@ -0,0 +1,355 @@ +import unittest +import types +from saucebrush.filters import ( + Filter, + YieldFilter, + FieldFilter, + SubrecordFilter, + ConditionalPathFilter, + ConditionalFilter, + FieldModifier, + FieldKeeper, + FieldRemover, + FieldMerger, + FieldAdder, + FieldCopier, + FieldRenamer, + Unique, +) + + +class DummyRecipe: + rejected_record = None + rejected_msg = None + + def 
+        self.rejected_record = record
+        self.rejected_msg = msg
+
+
+class Doubler(Filter):
+    def process_record(self, record):
+        return record * 2
+
+
+class OddRemover(Filter):
+    def process_record(self, record):
+        if record % 2 == 0:
+            return record
+        else:
+            return None  # explicitly return None
+
+
+class ListFlattener(YieldFilter):
+    def process_record(self, record):
+        for item in record:
+            yield item
+
+
+class FieldDoubler(FieldFilter):
+    def process_field(self, item):
+        return item * 2
+
+
+class NonModifyingFieldDoubler(Filter):
+    def __init__(self, key):
+        self.key = key
+
+    def process_record(self, record):
+        record = dict(record)
+        record[self.key] *= 2
+        return record
+
+
+class ConditionalOddRemover(ConditionalFilter):
+    def test_record(self, record):
+        # return True for even values
+        return record % 2 == 0
+
+
+class FilterTestCase(unittest.TestCase):
+    def _simple_data(self):
+        return [
+            {"a": 1, "b": 2, "c": 3},
+            {"a": 5, "b": 5, "c": 5},
+            {"a": 1, "b": 10, "c": 100},
+        ]
+
+    def assert_filter_result(self, filter_obj, expected_data):
+        result = filter_obj.attach(self._simple_data())
+        self.assertEqual(list(result), expected_data)
+
+    def test_reject_record(self):
+        recipe = DummyRecipe()
+        f = Doubler()
+        result = f.attach([1, 2, 3], recipe=recipe)
+        # next has to be called for attach to take effect
+        next(result)
+        f.reject_record("bad", "this one was bad")
+
+        # ensure that the rejection propagated to the recipe
+        self.assertEqual("bad", recipe.rejected_record)
+        self.assertEqual("this one was bad", recipe.rejected_msg)
+
+    def test_simple_filter(self):
+        df = Doubler()
+        result = df.attach([1, 2, 3])
+
+        # ensure we got a generator that yields 2,4,6
+        self.assertEqual(type(result), types.GeneratorType)
+        self.assertEqual(list(result), [2, 4, 6])
+
+    def test_simple_filter_return_none(self):
+        cf = OddRemover()
+        result = cf.attach(range(10))
+
+        # ensure only even numbers remain
+        self.assertEqual(list(result), [0, 2, 4, 6, 8])
+
+    def test_simple_yield_filter(self):
+        lf = ListFlattener()
+        result = lf.attach([[1], [2, 3], [4, 5, 6]])
+
+        # ensure we got a generator that yields 1,2,3,4,5,6
+        self.assertEqual(type(result), types.GeneratorType)
+        self.assertEqual(list(result), [1, 2, 3, 4, 5, 6])
+
+    def test_simple_field_filter(self):
+        ff = FieldDoubler(["a", "c"])
+
+        # check against expected data
+        expected_data = [
+            {"a": 2, "b": 2, "c": 6},
+            {"a": 10, "b": 5, "c": 10},
+            {"a": 2, "b": 10, "c": 200},
+        ]
+        self.assert_filter_result(ff, expected_data)
+
+    def test_conditional_filter(self):
+        cf = ConditionalOddRemover()
+        result = cf.attach(range(10))
+
+        # ensure only even numbers remain
+        self.assertEqual(list(result), [0, 2, 4, 6, 8])
+
+    # Tests for Subrecord
+
+    def test_subrecord_filter_list(self):
+        data = [
+            {"a": [{"b": 2}, {"b": 4}]},
+            {"a": [{"b": 5}]},
+            {"a": [{"b": 8}, {"b": 2}, {"b": 1}]},
+        ]
+
+        expected = [
+            {"a": [{"b": 4}, {"b": 8}]},
+            {"a": [{"b": 10}]},
+            {"a": [{"b": 16}, {"b": 4}, {"b": 2}]},
+        ]
+
+        sf = SubrecordFilter("a", NonModifyingFieldDoubler("b"))
+        result = sf.attach(data)
+
+        self.assertEqual(list(result), expected)
+
+    def test_subrecord_filter_deep(self):
+        data = [
+            {"a": {"d": [{"b": 2}, {"b": 4}]}},
+            {"a": {"d": [{"b": 5}]}},
+            {"a": {"d": [{"b": 8}, {"b": 2}, {"b": 1}]}},
+        ]
+
+        expected = [
+            {"a": {"d": [{"b": 4}, {"b": 8}]}},
+            {"a": {"d": [{"b": 10}]}},
+            {"a": {"d": [{"b": 16}, {"b": 4}, {"b": 2}]}},
+        ]
+
+        sf = SubrecordFilter("a.d", NonModifyingFieldDoubler("b"))
+        result = sf.attach(data)
+
+        self.assertEqual(list(result), expected)
+
+    def test_subrecord_filter_nonlist(self):
+        data = [
+            {"a": {"b": {"c": 1}}},
+            {"a": {"b": {"c": 2}}},
+            {"a": {"b": {"c": 3}}},
+        ]
+
+        expected = [
+            {"a": {"b": {"c": 2}}},
+            {"a": {"b": {"c": 4}}},
+            {"a": {"b": {"c": 6}}},
+        ]
+
+        sf = SubrecordFilter("a.b", NonModifyingFieldDoubler("c"))
+        result = sf.attach(data)
+
+        self.assertEqual(list(result), expected)
+
+    def test_subrecord_filter_list_in_path(self):
+        data = [
+            {"a": [{"b": {"c": 5}}, {"b": {"c": 6}}]},
+            {"a": [{"b": {"c": 1}}, {"b": {"c": 2}}, {"b": {"c": 3}}]},
+            {"a": [{"b": {"c": 2}}]},
+        ]
+
+        expected = [
+            {"a": [{"b": {"c": 10}}, {"b": {"c": 12}}]},
+            {"a": [{"b": {"c": 2}}, {"b": {"c": 4}}, {"b": {"c": 6}}]},
+            {"a": [{"b": {"c": 4}}]},
+        ]
+
+        sf = SubrecordFilter("a.b", NonModifyingFieldDoubler("c"))
+        result = sf.attach(data)
+
+        self.assertEqual(list(result), expected)
+
+    def test_conditional_path(self):
+
+        predicate = lambda r: r["a"] == 1  # noqa
+
+        # double b if a == 1, otherwise double c
+        cpf = ConditionalPathFilter(predicate, FieldDoubler("b"), FieldDoubler("c"))
+        expected_data = [
+            {"a": 1, "b": 4, "c": 3},
+            {"a": 5, "b": 5, "c": 10},
+            {"a": 1, "b": 20, "c": 100},
+        ]
+
+        self.assert_filter_result(cpf, expected_data)
+
+    # Tests for Generic Filters
+
+    def test_field_modifier(self):
+        # another version of FieldDoubler
+        fm = FieldModifier(["a", "c"], lambda x: x * 2)
+
+        # check against expected data
+        expected_data = [
+            {"a": 2, "b": 2, "c": 6},
+            {"a": 10, "b": 5, "c": 10},
+            {"a": 2, "b": 10, "c": 200},
+        ]
+        self.assert_filter_result(fm, expected_data)
+
+    def test_field_keeper(self):
+        fk = FieldKeeper(["c"])
+
+        # check against expected results
+        expected_data = [{"c": 3}, {"c": 5}, {"c": 100}]
+        self.assert_filter_result(fk, expected_data)
+
+    def test_field_remover(self):
+        fr = FieldRemover(["a", "b"])
+
+        # check against expected results
+        expected_data = [{"c": 3}, {"c": 5}, {"c": 100}]
+        self.assert_filter_result(fr, expected_data)
+
+    def test_field_merger(self):
+        fm = FieldMerger({"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z)
+
+        # check against expected results
+        expected_data = [{"sum": 6}, {"sum": 15}, {"sum": 111}]
+        self.assert_filter_result(fm, expected_data)
+
+    def test_field_merger_keep_fields(self):
+        fm = FieldMerger(
+            {"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z, keep_fields=True
+        )
+
+        # check against expected results
+        expected_data = [
+            {"a": 1, "b": 2, "c": 3, "sum": 6},
+            {"a": 5, "b": 5, "c": 5, "sum": 15},
+            {"a": 1, "b": 10, "c": 100, "sum": 111},
+        ]
+        self.assert_filter_result(fm, expected_data)
+
+    def test_field_adder_scalar(self):
+        fa = FieldAdder("x", 7)
+
+        expected_data = [
+            {"a": 1, "b": 2, "c": 3, "x": 7},
+            {"a": 5, "b": 5, "c": 5, "x": 7},
+            {"a": 1, "b": 10, "c": 100, "x": 7},
+        ]
+        self.assert_filter_result(fa, expected_data)
+
+    def test_field_adder_callable(self):
+        fa = FieldAdder("x", lambda: 7)
+
+        expected_data = [
+            {"a": 1, "b": 2, "c": 3, "x": 7},
+            {"a": 5, "b": 5, "c": 5, "x": 7},
+            {"a": 1, "b": 10, "c": 100, "x": 7},
+        ]
+        self.assert_filter_result(fa, expected_data)
+
+    def test_field_adder_iterable(self):
+        fa = FieldAdder("x", [1, 2, 3])
+
+        expected_data = [
+            {"a": 1, "b": 2, "c": 3, "x": 1},
+            {"a": 5, "b": 5, "c": 5, "x": 2},
+            {"a": 1, "b": 10, "c": 100, "x": 3},
+        ]
+        self.assert_filter_result(fa, expected_data)
+
+    def test_field_adder_replace(self):
+        fa = FieldAdder("b", lambda: 7)
+
+        expected_data = [
+            {"a": 1, "b": 7, "c": 3},
+            {"a": 5, "b": 7, "c": 5},
"b": 7, "c": 5}, + {"a": 1, "b": 7, "c": 100}, + ] + self.assert_filter_result(fa, expected_data) + + def test_field_adder_no_replace(self): + fa = FieldAdder("b", lambda: 7, replace=False) + + expected_data = [ + {"a": 1, "b": 2, "c": 3}, + {"a": 5, "b": 5, "c": 5}, + {"a": 1, "b": 10, "c": 100}, + ] + self.assert_filter_result(fa, expected_data) + + def test_field_copier(self): + fc = FieldCopier({"a2": "a", "b2": "b"}) + + expected_data = [ + {"a": 1, "b": 2, "c": 3, "a2": 1, "b2": 2}, + {"a": 5, "b": 5, "c": 5, "a2": 5, "b2": 5}, + {"a": 1, "b": 10, "c": 100, "a2": 1, "b2": 10}, + ] + self.assert_filter_result(fc, expected_data) + + def test_field_renamer(self): + fr = FieldRenamer({"x": "a", "y": "b"}) + + expected_data = [ + {"x": 1, "y": 2, "c": 3}, + {"x": 5, "y": 5, "c": 5}, + {"x": 1, "y": 10, "c": 100}, + ] + self.assert_filter_result(fr, expected_data) + + # TODO: splitter & flattner tests? + + def test_unique_filter(self): + u = Unique() + in_data = [{"a": 77}, {"a": 33}, {"a": 77}] + expected_data = [{"a": 77}, {"a": 33}] + result = u.attach(in_data) + + self.assertEqual(list(result), expected_data) + + # TODO: unicode & string filter tests + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_recipes.py b/tests/test_recipes.py new file mode 100644 index 0000000..3a31c6f --- /dev/null +++ b/tests/test_recipes.py @@ -0,0 +1,49 @@ +import pytest +from saucebrush import Recipe, run_recipe, SaucebrushError, OvercookedError +from saucebrush.filters import Filter + + +class Raiser(Filter): + def process_record(self, record): + raise Exception("bad record") + + +class Saver(Filter): + def __init__(self): + self.saved = [] + + def process_record(self, record): + self.saved.append(record) + return record + + +def test_error_stream(): + saver = Saver() + recipe = Recipe(Raiser(), error_stream=saver) + recipe.run([{"a": 1}, {"b": 2}]) + recipe.done() + + assert saver.saved[0]["record"] == {"a": 1} + assert saver.saved[1]["record"] == {"b": 2} + + # Must pass either a Recipe, a Filter or an iterable of Filters + # as the error_stream argument + assert pytest.raises(SaucebrushError, Recipe, error_stream=5) + + +def test_run_recipe(): + saver = Saver() + run_recipe([1, 2], saver) + + assert saver.saved == [1, 2] + + +def test_done(): + saver = Saver() + recipe = Recipe(saver) + recipe.run([1]) + recipe.done() + + assert pytest.raises(OvercookedError, recipe.run, [2]) + assert pytest.raises(OvercookedError, recipe.done) + assert saver.saved == [1] diff --git a/tests/test_sources.py b/tests/test_sources.py new file mode 100644 index 0000000..da6fc1d --- /dev/null +++ b/tests/test_sources.py @@ -0,0 +1,90 @@ +from io import StringIO + +from saucebrush.sources import ( + CSVSource, + FixedWidthFileSource, + HtmlTableSource, + JSONSource, +) + + +def _get_csv(): + data = """a,b,c +1,2,3 +5,5,5 +1,10,100""" + return StringIO(data) + + +def test_csv_source_basic(): + source = CSVSource(_get_csv()) + expected_data = [ + {"a": "1", "b": "2", "c": "3"}, + {"a": "5", "b": "5", "c": "5"}, + {"a": "1", "b": "10", "c": "100"}, + ] + assert list(source) ==expected_data + + +def test_csv_source_fieldnames(): + source = CSVSource(_get_csv(), ["x", "y", "z"]) + expected_data = [ + {"x": "a", "y": "b", "z": "c"}, + {"x": "1", "y": "2", "z": "3"}, + {"x": "5", "y": "5", "z": "5"}, + {"x": "1", "y": "10", "z": "100"}, + ] + assert list(source) == expected_data + + +def test_csv_source_skiprows(): + source = CSVSource(_get_csv(), skiprows=1) + expected_data = [ + {"a": "5", "b": "5", "c": 
"5"}, + {"a": "1", "b": "10", "c": "100"}, + ] + assert list(source) == expected_data + + +def test_fixed_width_source(): + data = StringIO("JamesNovember 3 1986\nTim September151999") + fields = (("name", 5), ("month", 9), ("day", 2), ("year", 4)) + source = FixedWidthFileSource(data, fields) + expected_data = [ + {"name": "James", "month": "November", "day": "3", "year": "1986"}, + {"name": "Tim", "month": "September", "day": "15", "year": "1999"}, + ] + assert list(source) == expected_data + + +def test_json_source(): + + content = StringIO("""[{"a": 1, "b": "2", "c": 3}]""") + + js = JSONSource(content) + assert list(js) == [{"a": 1, "b": "2", "c": 3}] + + +def test_html_table_source(): + + content = StringIO( + """ + + + + + + + + + + + + +
diff --git a/tests/test_sources.py b/tests/test_sources.py
new file mode 100644
index 0000000..da6fc1d
--- /dev/null
+++ b/tests/test_sources.py
@@ -0,0 +1,90 @@
+from io import StringIO
+
+from saucebrush.sources import (
+    CSVSource,
+    FixedWidthFileSource,
+    HtmlTableSource,
+    JSONSource,
+)
+
+
+def _get_csv():
+    data = """a,b,c
+1,2,3
+5,5,5
+1,10,100"""
+    return StringIO(data)
+
+
+def test_csv_source_basic():
+    source = CSVSource(_get_csv())
+    expected_data = [
+        {"a": "1", "b": "2", "c": "3"},
+        {"a": "5", "b": "5", "c": "5"},
+        {"a": "1", "b": "10", "c": "100"},
+    ]
+    assert list(source) == expected_data
+
+
+def test_csv_source_fieldnames():
+    source = CSVSource(_get_csv(), ["x", "y", "z"])
+    expected_data = [
+        {"x": "a", "y": "b", "z": "c"},
+        {"x": "1", "y": "2", "z": "3"},
+        {"x": "5", "y": "5", "z": "5"},
+        {"x": "1", "y": "10", "z": "100"},
+    ]
+    assert list(source) == expected_data
+
+
+def test_csv_source_skiprows():
+    source = CSVSource(_get_csv(), skiprows=1)
+    expected_data = [
+        {"a": "5", "b": "5", "c": "5"},
+        {"a": "1", "b": "10", "c": "100"},
+    ]
+    assert list(source) == expected_data
+
+
+def test_fixed_width_source():
+    data = StringIO("JamesNovember 3 1986\nTim  September151999")
+    fields = (("name", 5), ("month", 9), ("day", 2), ("year", 4))
+    source = FixedWidthFileSource(data, fields)
+    expected_data = [
+        {"name": "James", "month": "November", "day": "3", "year": "1986"},
+        {"name": "Tim", "month": "September", "day": "15", "year": "1999"},
+    ]
+    assert list(source) == expected_data
+
+
+def test_json_source():
+
+    content = StringIO("""[{"a": 1, "b": "2", "c": 3}]""")
+
+    js = JSONSource(content)
+    assert list(js) == [{"a": 1, "b": "2", "c": 3}]
+
+
+def test_html_table_source():
+
+    content = StringIO(
+        """
+    <html>
+        <table id="thetable">
+            <tr>
+                <th>a</th>
+                <th>b</th>
+                <th>c</th>
+            </tr>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+                <td>3</td>
+            </tr>
+        </table>
+    </html>
+"""
+    )
+
+    hts = HtmlTableSource(content, "thetable")
+    assert list(hts) == [{"a": "1", "b": "2", "c": "3"}]
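Aside: a sketch of feeding one of these sources into a recipe. Per the tests above, CSVSource yields one dict per row with string values; "input.csv" is a hypothetical path:

    from saucebrush import run_recipe
    from saucebrush.filters import FieldRemover
    from saucebrush.sources import CSVSource

    with open("input.csv") as f:  # hypothetical file
        # drop column c from every row as the records stream through
        run_recipe(CSVSource(f), FieldRemover(["c"]))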
+ + """ + ) + + hts = HtmlTableSource(content, "thetable") + assert list(hts) == [{"a": "1", "b": "2", "c": "3"}] diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 0000000..6ef4eab --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,55 @@ +from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram + + +def _simple_data(): + return [ + {"a": 1, "b": 2, "c": 3}, + {"a": 5, "b": 5, "c": 5}, + {"a": 1, "b": 10, "c": 100}, + ] + + +def test_sum(): + fltr = Sum("b") + list(fltr.attach(_simple_data())) + assert fltr.value() == 17 + + +def test_average(): + fltr = Average("c") + list(fltr.attach(_simple_data())) + assert fltr.value() == 36.0 + + +def test_median(): + # odd number of values + fltr = Median("a") + list(fltr.attach(_simple_data())) + assert fltr.value() == 1 + + # even number of values + fltr = Median("a") + list(fltr.attach(_simple_data()[:2])) + assert fltr.value() == 3 + + +def test_minmax(): + fltr = MinMax("b") + list(fltr.attach(_simple_data())) + assert fltr.value() == (2, 10) + + +def test_standard_deviation(): + fltr = StandardDeviation("c") + list(fltr.attach(_simple_data())) + assert fltr.average() == 36.0 + assert fltr.median() == 5 + assert fltr.value() == (55.4346462061408, 3073.0) + assert fltr.value(True) == (45.2621990922521, 2048.6666666666665) + + +def test_histogram(): + fltr = Histogram("a") + fltr.label_length = 1 + list(fltr.attach(_simple_data())) + assert str(fltr) == "\n1 **\n5 *\n"