Compare commits


No commits in common. "main" and "0.4.0" have entirely different histories.
main...0.4.0

35 changed files with 1308 additions and 2864 deletions

.github/FUNDING.yml

@@ -1 +0,0 @@
github: [jamesturk]


@@ -1,17 +0,0 @@
---
name: Bug report
about: Create a report to help us improve
title: ""
labels: bug
assignees: ''
---
**Describe the bug**
A clear and concise description of what the bug is.
**Environment**
Please provide output of `python -V` & `spatula --version`, as well as what operating system you're using, and any other details:
**Additional context**
Add any other context about the problem here.


@@ -1,20 +0,0 @@
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: enhancement
assignees: ''
---
**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
**Describe the solution you'd like**
A clear and concise description of what you want to happen.
**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.
**Additional context**
Add any other context about the feature request here.


@@ -1,36 +0,0 @@
name: Test & Lint
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
build:
runs-on: ubuntu-latest
strategy:
max-parallel: 4
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
steps:
# Python & dependency installation
- uses: actions/checkout@v3
- name: setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: install Poetry
uses: snok/install-poetry@v1.2.1
- name: set poetry config path
run: poetry config virtualenvs.path ~/.virtualenvs
- name: install dependencies
run: poetry install
# - name: lint with mypy
# run: poetry run mypy src
- name: lint with flake8
run: poetry run flake8 --show-source --statistics --ignore=E203,E501,W503 src
- name: pytest
run: poetry run pytest

.gitignore

@@ -1,2 +1 @@
*.pyc
docs/_build

LICENSE

@@ -1,25 +0,0 @@
Copyright (c) 2015-, James Turk
Copyright (c) 2011-2015, Sunlight Labs
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.



@@ -1,153 +0,0 @@
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " singlehtml to make a single large HTML file"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " devhelp to make HTML files and a Devhelp project"
@echo " epub to make an epub"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " latexpdf to make LaTeX files and run them through pdflatex"
@echo " text to make text files"
@echo " man to make manual pages"
@echo " texinfo to make Texinfo files"
@echo " info to make Texinfo files and run them through makeinfo"
@echo " gettext to make PO message catalogs"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
-rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/saucebrush.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/saucebrush.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/saucebrush"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/saucebrush"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
info:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo "Running Texinfo files through makeinfo..."
make -C $(BUILDDIR)/texinfo info
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."


@@ -1,242 +0,0 @@
# -*- coding: utf-8 -*-
#
# saucebrush documentation build configuration file, created by
# sphinx-quickstart on Sun Mar 11 14:23:51 2012.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys, os
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#sys.path.insert(0, os.path.abspath('.'))
# -- General configuration -----------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'saucebrush'
copyright = u'2012, James Turk'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.5'
# The full version, including alpha/beta/rc tags.
release = '0.5.0-dev'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ['_build']
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
#html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
#html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = 'saucebrushdoc'
# -- Options for LaTeX output --------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'saucebrush.tex', u'saucebrush Documentation',
u'James Turk', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# If true, show page references after internal links.
#latex_show_pagerefs = False
# If true, show URL addresses after external links.
#latex_show_urls = False
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_domain_indices = True
# -- Options for manual page output --------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
('index', 'saucebrush', u'saucebrush Documentation',
[u'James Turk'], 1)
]
# If true, show URL addresses after external links.
#man_show_urls = False
# -- Options for Texinfo output ------------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
('index', 'saucebrush', u'saucebrush Documentation',
u'James Turk', 'saucebrush', 'One line description of project.',
'Miscellaneous'),
]
# Documents to append as an appendix to all manuals.
#texinfo_appendices = []
# If false, no module index is generated.
#texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
#texinfo_show_urls = 'footnote'


@@ -1,43 +0,0 @@
saucebrush |release|
====================
Overview
--------
saucebrush is a tool for writing ETL pipelines in pure Python.
The basic premise of saucebrush is that you write a `Recipe` that can then
be applied to data. A `Recipe` is a pipeline consisting of `sources`,
`filters`, and `emitters`.
A `source` is a simple object that yields data one piece at a time.
An example of a source might be a CSV file or a database; it is also possible
to write your own sources.
A `filter` is a function that takes a single record and returns a modified
version of that record. Writing a filter is as simple as writing a function
that modifies a single record in the desired way. A fairly comprehensive
suite of common filters is also available, making it possible to do common
tasks without writing any of your own filters.
An `emitter` is actually a special case of `filter` that doesn't modify
the record but instead writes data out in some way. Emitters can be hooked
in anywhere in your pipeline but are typically placed at the end to
save the results of a recipe. As with `sources`, emitters exist for most
common formats (CSV, various SQL dialects, etc.), and it is also possible
to write your own emitter.
Contents:
.. toctree::
:maxdepth: 2
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
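To make the overview concrete, a minimal pipeline wires a source, a filter, and an emitter together. A hedged sketch: FieldRemover, CSVEmitter, and run_recipe all appear later in this diff, while CSVSource lives in saucebrush.sources (not shown here), so its exact signature, along with the file names, is an assumption:

from saucebrush import run_recipe
from saucebrush.sources import CSVSource    # assumed signature: CSVSource(csvfile)
from saucebrush.filters import FieldRemover
from saucebrush.emitters import CSVEmitter

# source -> filter -> emitter: read people.csv, drop one column,
# write the remaining columns back out
run_recipe(
    CSVSource(open("people.csv")),
    FieldRemover(["ssn"]),
    CSVEmitter(open("out.csv", "w"), ("name", "phone")),
)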


@@ -3,7 +3,7 @@ from saucebrush.outputs import CSVOutput, DebugOutput
def merge_columns(datasource, mapping, merge_func):
for rowdata in datasource:
for to_col,from_cols in mapping.items():
for to_col,from_cols in mapping.iteritems():
values = [rowdata.pop(col, None) for col in from_cols]
rowdata[to_col] = reduce(merge_func, values)
yield rowdata
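The only functional change in this hunk is Python 2's dict.iteritems() in 0.4.0 versus dict.items() on main; merge_columns itself folds several source columns into one with reduce. A hypothetical call (on Python 3, reduce must come from functools, which this file would also need to import):

from functools import reduce  # Python 3 only; a builtin on Python 2

rows = [{"first": "James", "last": "Turk"}]
merged = list(merge_columns(rows, {"name": ["first", "last"]},
                            lambda a, b: a + " " + b))
# merged == [{"name": "James Turk"}]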


@@ -1,7 +1,7 @@
import re
import exceptions
class FECSource:
class FECSource(object):
SPLIT_CHAR = '\x1c'
FORM_FIELDS = {
@@ -84,7 +84,7 @@ class FECSource:
@staticmethod
def get_form_type(rectype):
for type_re, type in FECSource.FORM_MAPPING.items():
for type_re, type in FECSource.FORM_MAPPING.iteritems():
if type_re.match(rectype):
return type
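get_form_type scans FORM_MAPPING, whose keys are compiled regular expressions, and returns the mapped form type for the first pattern that matches. A self-contained sketch of that lookup (the pattern and type below are invented stand-ins, not actual FEC form codes from this file):

import re

# hypothetical stand-in for FECSource.FORM_MAPPING
FORM_MAPPING = {re.compile(r"^F3[XPL]?[NA]?$"): "F3"}

def get_form_type(rectype):
    for type_re, form_type in FORM_MAPPING.items():
        if type_re.match(rectype):
            return form_type  # implicitly None when nothing matches

print(get_form_type("F3XN"))  # -> F3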

poetry.lock

@@ -1,395 +0,0 @@
[[package]]
name = "attrs"
version = "22.1.0"
description = "Classes Without Boilerplate"
category = "dev"
optional = false
python-versions = ">=3.5"
[package.extras]
dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy (>=0.900,!=0.940)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "sphinx", "sphinx-notfound-page", "zope.interface"]
docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"]
tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "zope.interface"]
tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy (>=0.900,!=0.940)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins"]
[[package]]
name = "black"
version = "22.10.0"
description = "The uncompromising code formatter."
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_full_version < \"3.11.0a7\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "click"
version = "8.1.3"
description = "Composable command line interface toolkit"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
[[package]]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = false
python-versions = ">=3.7"
[[package]]
name = "exceptiongroup"
version = "1.0.1"
description = "Backport of PEP 654 (exception groups)"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.extras]
test = ["pytest (>=6)"]
[[package]]
name = "flake8"
version = "5.0.4"
description = "the modular source code checker: pep8 pyflakes and co"
category = "dev"
optional = false
python-versions = ">=3.6.1"
[package.dependencies]
mccabe = ">=0.7.0,<0.8.0"
pycodestyle = ">=2.9.0,<2.10.0"
pyflakes = ">=2.5.0,<2.6.0"
[[package]]
name = "iniconfig"
version = "1.1.1"
description = "iniconfig: brain-dead simple config-ini parsing"
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "lxml"
version = "4.9.1"
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
[package.extras]
cssselect = ["cssselect (>=0.7)"]
html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
source = ["Cython (>=0.29.7)"]
[[package]]
name = "mccabe"
version = "0.7.0"
description = "McCabe checker, plugin for flake8"
category = "dev"
optional = false
python-versions = ">=3.6"
[[package]]
name = "mypy-extensions"
version = "0.4.3"
description = "Experimental type system extensions for programs checked with the mypy typechecker."
category = "dev"
optional = false
python-versions = "*"
[[package]]
name = "packaging"
version = "21.3"
description = "Core utilities for Python packages"
category = "dev"
optional = false
python-versions = ">=3.6"
[package.dependencies]
pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
[[package]]
name = "pathspec"
version = "0.10.1"
description = "Utility library for gitignore style pattern matching of file paths."
category = "dev"
optional = false
python-versions = ">=3.7"
[[package]]
name = "platformdirs"
version = "2.5.3"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
python-versions = ">=3.7"
[package.extras]
docs = ["furo (>=2022.9.29)", "proselint (>=0.13)", "sphinx (>=5.3)", "sphinx-autodoc-typehints (>=1.19.4)"]
test = ["appdirs (==1.4.4)", "pytest (>=7.2)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"]
[[package]]
name = "pluggy"
version = "1.0.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
optional = false
python-versions = ">=3.6"
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pycodestyle"
version = "2.9.1"
description = "Python style guide checker"
category = "dev"
optional = false
python-versions = ">=3.6"
[[package]]
name = "pyflakes"
version = "2.5.0"
description = "passive checker of Python programs"
category = "dev"
optional = false
python-versions = ">=3.6"
[[package]]
name = "pyparsing"
version = "3.0.9"
description = "pyparsing module - Classes and methods to define and execute parsing grammars"
category = "dev"
optional = false
python-versions = ">=3.6.8"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "pytest"
version = "7.2.0"
description = "pytest: simple powerful testing with Python"
category = "dev"
optional = false
python-versions = ">=3.7"
[package.dependencies]
attrs = ">=19.2.0"
colorama = {version = "*", markers = "sys_platform == \"win32\""}
exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]
[[package]]
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
category = "dev"
optional = false
python-versions = ">=3.7"
[metadata]
lock-version = "1.1"
python-versions = "^3.10"
content-hash = "765977e700b56e9b852f6ca6f5d54e2c1343b3a07b9220e83ef969a277f67866"
[metadata.files]
attrs = [
{file = "attrs-22.1.0-py2.py3-none-any.whl", hash = "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c"},
{file = "attrs-22.1.0.tar.gz", hash = "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6"},
]
black = [
{file = "black-22.10.0-1fixedarch-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5cc42ca67989e9c3cf859e84c2bf014f6633db63d1cbdf8fdb666dcd9e77e3fa"},
{file = "black-22.10.0-1fixedarch-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:5d8f74030e67087b219b032aa33a919fae8806d49c867846bfacde57f43972ef"},
{file = "black-22.10.0-1fixedarch-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:197df8509263b0b8614e1df1756b1dd41be6738eed2ba9e9769f3880c2b9d7b6"},
{file = "black-22.10.0-1fixedarch-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:2644b5d63633702bc2c5f3754b1b475378fbbfb481f62319388235d0cd104c2d"},
{file = "black-22.10.0-1fixedarch-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:e41a86c6c650bcecc6633ee3180d80a025db041a8e2398dcc059b3afa8382cd4"},
{file = "black-22.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2039230db3c6c639bd84efe3292ec7b06e9214a2992cd9beb293d639c6402edb"},
{file = "black-22.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14ff67aec0a47c424bc99b71005202045dc09270da44a27848d534600ac64fc7"},
{file = "black-22.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:819dc789f4498ecc91438a7de64427c73b45035e2e3680c92e18795a839ebb66"},
{file = "black-22.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b9b29da4f564ba8787c119f37d174f2b69cdfdf9015b7d8c5c16121ddc054ae"},
{file = "black-22.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8b49776299fece66bffaafe357d929ca9451450f5466e997a7285ab0fe28e3b"},
{file = "black-22.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:21199526696b8f09c3997e2b4db8d0b108d801a348414264d2eb8eb2532e540d"},
{file = "black-22.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e464456d24e23d11fced2bc8c47ef66d471f845c7b7a42f3bd77bf3d1789650"},
{file = "black-22.10.0-cp37-cp37m-win_amd64.whl", hash = "sha256:9311e99228ae10023300ecac05be5a296f60d2fd10fff31cf5c1fa4ca4b1988d"},
{file = "black-22.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fba8a281e570adafb79f7755ac8721b6cf1bbf691186a287e990c7929c7692ff"},
{file = "black-22.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:915ace4ff03fdfff953962fa672d44be269deb2eaf88499a0f8805221bc68c87"},
{file = "black-22.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:444ebfb4e441254e87bad00c661fe32df9969b2bf224373a448d8aca2132b395"},
{file = "black-22.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:974308c58d057a651d182208a484ce80a26dac0caef2895836a92dd6ebd725e0"},
{file = "black-22.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72ef3925f30e12a184889aac03d77d031056860ccae8a1e519f6cbb742736383"},
{file = "black-22.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:432247333090c8c5366e69627ccb363bc58514ae3e63f7fc75c54b1ea80fa7de"},
{file = "black-22.10.0-py3-none-any.whl", hash = "sha256:c957b2b4ea88587b46cf49d1dc17681c1e672864fd7af32fc1e9664d572b3458"},
{file = "black-22.10.0.tar.gz", hash = "sha256:f513588da599943e0cde4e32cc9879e825d58720d6557062d1098c5ad80080e1"},
]
click = [
{file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"},
{file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"},
]
colorama = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
cssselect = [
{file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"},
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]
exceptiongroup = [
{file = "exceptiongroup-1.0.1-py3-none-any.whl", hash = "sha256:4d6c0aa6dd825810941c792f53d7b8d71da26f5e5f84f20f9508e8f2d33b140a"},
{file = "exceptiongroup-1.0.1.tar.gz", hash = "sha256:73866f7f842ede6cb1daa42c4af078e2035e5f7607f0e2c762cc51bb31bbe7b2"},
]
flake8 = [
{file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"},
{file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"},
]
iniconfig = [
{file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"},
{file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"},
]
lxml = [
{file = "lxml-4.9.1-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed"},
{file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc"},
{file = "lxml-4.9.1-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc"},
{file = "lxml-4.9.1-cp27-cp27m-win32.whl", hash = "sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3"},
{file = "lxml-4.9.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627"},
{file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84"},
{file = "lxml-4.9.1-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837"},
{file = "lxml-4.9.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad"},
{file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5"},
{file = "lxml-4.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8"},
{file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8"},
{file = "lxml-4.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d"},
{file = "lxml-4.9.1-cp310-cp310-win32.whl", hash = "sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7"},
{file = "lxml-4.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b"},
{file = "lxml-4.9.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d"},
{file = "lxml-4.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3"},
{file = "lxml-4.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29"},
{file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d"},
{file = "lxml-4.9.1-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318"},
{file = "lxml-4.9.1-cp35-cp35m-win32.whl", hash = "sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7"},
{file = "lxml-4.9.1-cp35-cp35m-win_amd64.whl", hash = "sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4"},
{file = "lxml-4.9.1-cp36-cp36m-macosx_10_15_x86_64.whl", hash = "sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b"},
{file = "lxml-4.9.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf"},
{file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3"},
{file = "lxml-4.9.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391"},
{file = "lxml-4.9.1-cp36-cp36m-win32.whl", hash = "sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e"},
{file = "lxml-4.9.1-cp36-cp36m-win_amd64.whl", hash = "sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7"},
{file = "lxml-4.9.1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3"},
{file = "lxml-4.9.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca"},
{file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785"},
{file = "lxml-4.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785"},
{file = "lxml-4.9.1-cp37-cp37m-win32.whl", hash = "sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a"},
{file = "lxml-4.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e"},
{file = "lxml-4.9.1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130"},
{file = "lxml-4.9.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715"},
{file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036"},
{file = "lxml-4.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387"},
{file = "lxml-4.9.1-cp38-cp38-win32.whl", hash = "sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94"},
{file = "lxml-4.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345"},
{file = "lxml-4.9.1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91"},
{file = "lxml-4.9.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000"},
{file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25"},
{file = "lxml-4.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd"},
{file = "lxml-4.9.1-cp39-cp39-win32.whl", hash = "sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb"},
{file = "lxml-4.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d"},
{file = "lxml-4.9.1-pp37-pypy37_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c"},
{file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b"},
{file = "lxml-4.9.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc"},
{file = "lxml-4.9.1-pp38-pypy38_pp73-macosx_10_15_x86_64.whl", hash = "sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b"},
{file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2"},
{file = "lxml-4.9.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73"},
{file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c"},
{file = "lxml-4.9.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9"},
{file = "lxml-4.9.1.tar.gz", hash = "sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f"},
]
mccabe = [
{file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"},
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
]
mypy-extensions = [
{file = "mypy_extensions-0.4.3-py2.py3-none-any.whl", hash = "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d"},
{file = "mypy_extensions-0.4.3.tar.gz", hash = "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"},
]
packaging = [
{file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
{file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
]
pathspec = [
{file = "pathspec-0.10.1-py3-none-any.whl", hash = "sha256:46846318467efc4556ccfd27816e004270a9eeeeb4d062ce5e6fc7a87c573f93"},
{file = "pathspec-0.10.1.tar.gz", hash = "sha256:7ace6161b621d31e7902eb6b5ae148d12cfd23f4a249b9ffb6b9fee12084323d"},
]
platformdirs = [
{file = "platformdirs-2.5.3-py3-none-any.whl", hash = "sha256:0cb405749187a194f444c25c82ef7225232f11564721eabffc6ec70df83b11cb"},
{file = "platformdirs-2.5.3.tar.gz", hash = "sha256:6e52c21afff35cb659c6e52d8b4d61b9bd544557180440538f255d9382c8cbe0"},
]
pluggy = [
{file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"},
{file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"},
]
pycodestyle = [
{file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"},
{file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"},
]
pyflakes = [
{file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"},
{file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"},
]
pyparsing = [
{file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"},
{file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"},
]
pytest = [
{file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"},
{file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"},
]
tomli = [
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]


@@ -1,22 +0,0 @@
[tool.poetry]
name = "saucebrush"
version = "0.6.0"
description = ""
authors = ["James Turk <dev@jamesturk.net>"]
license = "MIT"
readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
lxml = "^4.9.1"
cssselect = "^1.2.0"
[tool.poetry.group.dev.dependencies]
pytest = "^7.2.0"
flake8 = "^5.0.4"
black = "^22.10.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"


@@ -2,7 +2,7 @@
Saucebrush is a data loading & manipulation framework written in python.
"""
from . import filters, emitters, sources, utils # noqa
import filters, emitters, sources, utils
class SaucebrushError(Exception):
@@ -13,39 +13,39 @@ class OvercookedError(Exception):
"""
Exception for trying to operate on a Recipe that has been finished.
"""
pass
class Recipe:
class Recipe(object):
def __init__(self, *filter_args, **kwargs):
self.finished = False
self.filters = []
for filter in filter_args:
if hasattr(filter, "filters"):
if hasattr(filter, 'filters'):
self.filters.extend(filter.filters)
else:
self.filters.append(filter)
self.error_stream = kwargs.get("error_stream")
self.error_stream = kwargs.get('error_stream')
if self.error_stream and not isinstance(self.error_stream, Recipe):
if isinstance(self.error_stream, filters.Filter):
self.error_stream = Recipe(self.error_stream)
elif hasattr(self.error_stream, "__iter__"):
elif hasattr(self.error_stream, '__iter__'):
self.error_stream = Recipe(*self.error_stream)
else:
raise SaucebrushError(
"error_stream must be either a filter" " or an iterable of filters"
)
raise SaucebrushError('error_stream must be either a filter'
' or an iterable of filters')
def reject_record(self, record, exception):
if self.error_stream:
self.error_stream.run([{"record": record, "exception": repr(exception)}])
self.error_stream.run([{'record': record,
'exception': repr(exception)}])
def run(self, source):
if self.finished:
raise OvercookedError("run() called on finished recipe")
raise OvercookedError('run() called on finished recipe')
# connect datapath
data = source
@@ -58,7 +58,7 @@ class Recipe:
def done(self):
if self.finished:
raise OvercookedError("done() called on finished recipe")
raise OvercookedError('done() called on finished recipe')
self.finished = True
@@ -74,7 +74,8 @@ class Recipe:
def run_recipe(source, *filter_args, **kwargs):
"""Process data, taking it from a source and applying any number of filters"""
""" Process data, taking it from a source and applying any number of filters
"""
r = Recipe(*filter_args, **kwargs)
r.run(source)
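The hunks above show the whole Recipe contract: filters are flattened into one list, run() pushes a source through them, and error_stream accepts a Filter, a Recipe, or an iterable of filters, with rejected records delivered as {'record': ..., 'exception': ...} dicts. A hedged sketch of typical use (the input data here is invented):

from saucebrush import Recipe
from saucebrush.filters import FieldRemover
from saucebrush.emitters import DebugEmitter

recipe = Recipe(
    FieldRemover(["internal_id"]),  # drop a field from each record
    DebugEmitter(),                 # print whatever survives
    error_stream=DebugEmitter(),    # a lone Filter is wrapped in a Recipe
)
recipe.run([{"internal_id": 1, "name": "spam"}])
recipe.done()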

saucebrush/emitters.py

@@ -0,0 +1,248 @@
"""
Saucebrush Emitters are filters that instead of modifying the record, output
it in some manner.
"""
from saucebrush.filters import Filter
class Emitter(Filter):
""" ABC for emitters
All derived emitters must provide an emit_record(self, record) that
takes a single record (python dictionary).
Emitters can optionally define a done() method that is called after
all records are processed (allowing database flushes, or printing of
aggregate data).
"""
def process_record(self, record):
self.emit_record(record)
return record
def emit_record(self, record):
""" Abstract method to be overridden.
Called with a single record, should "emit" the record unmodified.
"""
raise NotImplementedError('emit_record not defined in ' +
self.__class__.__name__)
def done(self):
""" No-op Method to be overridden.
Called when all processing is complete
"""
pass
class DebugEmitter(Emitter):
""" Emitter that prints raw records to a file, useful for debugging.
DebugEmitter() by default prints to stdout.
DebugEmitter(open('test', 'w')) would print to a file named test
"""
def __init__(self, outfile=None):
super(DebugEmitter, self).__init__()
if not outfile:
import sys
self._outfile = sys.stdout
else:
self._outfile = outfile
def emit_record(self, record):
self._outfile.write(str(record) + '\n')
class CountEmitter(Emitter):
""" Emitter that writes the record count to a file-like object.
CountEmitter() by default writes to stdout.
CountEmitter(outfile=open('test', 'w')) would write to a file named test.
CountEmitter(every=1000000) would write the count every 1,000,000 records.
"""
def __init__(self, every=1000, outfile=None, format=None):
super(CountEmitter, self).__init__()
if not outfile:
import sys
self._outfile = sys.stdout
else:
self._outfile = outfile
self._format = "%s\n" if format is None else format
self._every = every
self.count = 0
def emit_record(self, record):
self.count += 1
if self.count % self._every == 0:
self._outfile.write(self._format % self.count)
def done(self):
self._outfile.write(self._format % self.count)
class CSVEmitter(Emitter):
""" Emitter that writes records to a CSV file.
CSVEmitter(open('output.csv','w'), ('id', 'name', 'phone')) writes all
records to a csvfile with the columns in the order specified.
"""
def __init__(self, csvfile, fieldnames):
super(CSVEmitter, self).__init__()
import csv
self._dictwriter = csv.DictWriter(csvfile, fieldnames)
# write header row
self._dictwriter.writerow(dict(zip(fieldnames, fieldnames)))
def emit_record(self, record):
self._dictwriter.writerow(record)
class SqliteEmitter(Emitter):
""" Emitter that writes records to a SQLite database.
SqliteEmitter('addressbook.db', 'friend') writes all records to the
friend table in the SQLite database named addressbook.db
(To have the emitter create the table, the fieldnames should be passed
as a third parameter to SqliteEmitter.)
"""
def __init__(self, dbname, table_name, fieldnames=None, replace=False, quiet=False):
super(SqliteEmitter, self).__init__()
import sqlite3
self._conn = sqlite3.connect(dbname)
self._cursor = self._conn.cursor()
self._table_name = table_name
self._replace = replace
self._quiet = quiet
if fieldnames:
create = "CREATE TABLE IF NOT EXISTS %s (%s)" % (table_name,
', '.join([' '.join((field, 'TEXT')) for field in fieldnames]))
self._cursor.execute(create)
def emit_record(self, record):
import sqlite3
# input should be escaped with ? if data isn't trusted
qmarks = ','.join(('?',) * len(record))
insert = 'INSERT OR REPLACE' if self._replace else 'INSERT'
insert = '%s INTO %s (%s) VALUES (%s)' % (insert, self._table_name,
','.join(record.keys()),
qmarks)
try:
self._cursor.execute(insert, record.values())
except sqlite3.IntegrityError, ie:
if not self._quiet:
raise ie
self.reject_record(record, ie.message)
def done(self):
self._conn.commit()
self._conn.close()
class SqlDumpEmitter(Emitter):
""" Emitter that writes SQL INSERT statements.
The output generated by the SqlDumpEmitter is intended to be used to
populate a MySQL database.
SqlDumpEmitter(open('addresses.sql', 'w'), 'friend', ('name', 'phone'))
writes statements to addresses.sql to insert the data
into the friend table.
"""
def __init__(self, outfile, table_name, fieldnames):
super(SqlDumpEmitter, self).__init__()
self._fieldnames = fieldnames
if not outfile:
import sys
self._outfile = sys.stderr
else:
self._outfile = outfile
self._insert_str = "INSERT INTO `%s` (`%s`) VALUES (%%s);\n" % (
table_name, '`,`'.join(fieldnames))
def quote(self, item):
if item is None:
return "null"
elif isinstance(item, (unicode, str)):
item = item.replace("\\","\\\\").replace("'","\\'").replace(chr(0),'0')
return "'%s'" % item
else:
return "%s" % item
def emit_record(self, record):
quoted_data = [self.quote(record[field]) for field in self._fieldnames]
self._outfile.write(self._insert_str % ','.join(quoted_data))
def done(self):
self._outfile.close()
class DjangoModelEmitter(Emitter):
""" Emitter that populates a table corresponding to a django model.
Takes a django settings file, app label and model name and uses django
to insert the records into the appropriate table.
DjangoModelEmitter('settings.py', 'addressbook', 'friend') writes
records to addressbook.models.friend model using database settings
from settings.py.
"""
def __init__(self, dj_settings, app_label, model_name):
super(DjangoModelEmitter, self).__init__()
from saucebrush.utils import get_django_model
self._dbmodel = get_django_model(dj_settings, app_label, model_name)
if not self._dbmodel:
raise Exception("No such model: %s %s" % (app_label, model_name))
def emit_record(self, record):
self._dbmodel.objects.create(**record)
class MongoDBEmitter(Emitter):
""" Emitter that creates a document in a MongoDB datastore
The names of the database and collection in which the records will
be inserted are required parameters. The host and port are optional,
defaulting to 'localhost' and 27017, respectively.
"""
def __init__(self, database, collection, host='localhost', port=27017, drop_collection=False, conn=None):
super(MongoDBEmitter, self).__init__()
from pymongo.database import Database
if not isinstance(database, Database):
if not conn:
from pymongo.connection import Connection
conn = Connection(host, port)
db = conn[database]
else:
db = database
if drop_collection:
db.drop_collection(collection)
self.collection = db[collection]
def emit_record(self, record):
self.collection.insert(record)
class LoggingEmitter(Emitter):
""" Emitter that logs to a Python logging.Logger instance.
The msg_template will be passed the record being emitted as
a format parameter. The resulting message will get logged
at the provided level.
"""
import logging
def __init__(self, logger, msg_template, level=logging.DEBUG):
super(LoggingEmitter, self).__init__()
self.logger = logger
self.msg_template = msg_template
self.level = level
def emit_record(self, record):
self.logger.log(self.level, self.msg_template % record)
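Per the Emitter ABC at the top of this file, a new emitter only has to implement emit_record(), with done() as an optional flush hook. A toy subclass as an illustrative sketch, written in this file's own Python 2 idiom:

from saucebrush.emitters import Emitter

class ListEmitter(Emitter):
    """Illustrative emitter that accumulates records in memory."""

    def __init__(self):
        super(ListEmitter, self).__init__()
        self.records = []

    def emit_record(self, record):
        # called once per record; the record passes through unmodified
        self.records.append(record)

    def done(self):
        # optional hook, invoked after all records have been processed
        print("collected %d records" % len(self.records))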


@@ -12,11 +12,10 @@ import re
import time
######################
# Abstract Filters #
## Abstract Filters ##
######################
class Filter:
class Filter(object):
""" ABC for filters that operate on records.
All derived filters must provide a process_record(self, record) that
@@ -28,12 +27,11 @@ class Filter:
Called with a single record, should return modified record.
"""
raise NotImplementedError(
"process_record not defined in " + self.__class__.__name__
)
raise NotImplementedError('process_record not defined in ' +
self.__class__.__name__)
def reject_record(self, record, exception):
recipe = getattr(self, "_recipe")
recipe = getattr(self, '_recipe')
if recipe:
recipe.reject_record(record, exception)
@@ -75,7 +73,7 @@ class FieldFilter(Filter):
"""
def __init__(self, keys):
super().__init__()
super(FieldFilter, self).__init__()
self._target_keys = utils.str_or_list(keys)
def process_record(self, record):
@@ -93,13 +91,11 @@ class FieldFilter(Filter):
def process_field(self, item):
""" Given a value, return the value that it should be replaced with. """
raise NotImplementedError(
"process_field not defined in " + self.__class__.__name__
)
raise NotImplementedError('process_field not defined in ' +
self.__class__.__name__)
def __unicode__(self):
return "%s( %s )" % (self.__class__.__name__, str(self._target_keys))
return '%s( %s )' % (self.__class__.__name__, str(self._target_keys))
class ConditionalFilter(YieldFilter):
""" ABC for filters that only pass through records meeting a condition.
@@ -124,17 +120,14 @@ class ConditionalFilter(YieldFilter):
def test_record(self, record):
""" Given a record, return True iff it should be passed on """
raise NotImplementedError(
"test_record not defined in " + self.__class__.__name__
)
raise NotImplementedError('test_record not defined in ' +
self.__class__.__name__)
class ValidationError(Exception):
def __init__(self, record):
super().__init__(repr(record))
super(ValidationError, self).__init__(repr(record))
self.record = record
def _dotted_get(d, path):
"""
utility function for SubrecordFilter
@@ -142,16 +135,15 @@ def _dotted_get(d, path):
dives into a complex nested dictionary with paths like a.b.c
"""
if path:
key_pieces = path.split(".", 1)
key_pieces = path.split('.', 1)
piece = d[key_pieces[0]]
if isinstance(piece, (tuple, list)):
return [_dotted_get(i, ".".join(key_pieces[1:])) for i in piece]
return [_dotted_get(i, '.'.join(key_pieces[1:])) for i in piece]
elif isinstance(piece, (dict)):
return _dotted_get(piece, ".".join(key_pieces[1:]))
return _dotted_get(piece, '.'.join(key_pieces[1:]))
else:
return d
class SubrecordFilter(Filter):
""" Filter that calls another filter on subrecord(s) of a record
@@ -160,8 +152,8 @@ class SubrecordFilter(Filter):
"""
def __init__(self, field_path, filter_):
if "." in field_path:
self.field_path, self.key = field_path.rsplit(".", 1)
if '.' in field_path:
self.field_path, self.key = field_path.rsplit('.', 1)
else:
self.field_path = None
self.key = field_path
@@ -186,7 +178,6 @@ class SubrecordFilter(Filter):
self.process_subrecord(subrecord_parent)
return record
class ConditionalPathFilter(Filter):
""" Filter that uses a predicate to split input among two filter paths. """
@@ -201,12 +192,10 @@ class ConditionalPathFilter(Filter):
else:
return self.false_filter.process_record(record)
#####################
# Generic Filters #
## Generic Filters ##
#####################
class FieldModifier(FieldFilter):
""" Filter that calls a given function on a given set of fields.
@@ -215,36 +204,15 @@ class FieldModifier(FieldFilter):
"""
def __init__(self, keys, func):
super().__init__(keys)
super(FieldModifier, self).__init__(keys)
self._filter_func = func
def process_field(self, item):
return self._filter_func(item)
def __str__(self):
return "%s( %s, %s )" % (
self.__class__.__name__,
str(self._target_keys),
str(self._filter_func),
)
class FieldKeeper(Filter):
"""Filter that removes all but the given set of fields.
FieldKeeper(('spam', 'eggs')) removes all but the spam and eggs
fields from every record filtered.
"""
def __init__(self, keys):
super().__init__()
self._target_keys = utils.str_or_list(keys)
def process_record(self, record):
for key in list(record.keys()):
if key not in self._target_keys:
del record[key]
return record
def __unicode__(self):
return '%s( %s, %s )' % (self.__class__.__name__,
str(self._target_keys), str(self._filter_func))
class FieldRemover(Filter):
@ -255,7 +223,7 @@ class FieldRemover(Filter):
"""
def __init__(self, keys):
super().__init__()
super(FieldRemover, self).__init__()
self._target_keys = utils.str_or_list(keys)
def process_record(self, record):
@ -263,8 +231,8 @@ class FieldRemover(Filter):
record.pop(key, None)
return record
def __str__(self):
return "%s( %s )" % (self.__class__.__name__, str(self._target_keys))
def __unicode__(self):
return '%s( %s )' % (self.__class__.__name__, str(self._target_keys))
class FieldMerger(Filter):
@ -277,13 +245,13 @@ class FieldMerger(Filter):
"""
def __init__(self, mapping, merge_func, keep_fields=False):
super().__init__()
super(FieldMerger, self).__init__()
self._field_mapping = mapping
self._merge_func = merge_func
self._keep_fields = keep_fields
def process_record(self, record):
for to_col, from_cols in self._field_mapping.items():
for to_col, from_cols in self._field_mapping.iteritems():
if self._keep_fields:
values = [record.get(col, None) for col in from_cols]
else:
@ -291,12 +259,10 @@ class FieldMerger(Filter):
record[to_col] = self._merge_func(*values)
return record
def __str__(self):
return "%s( %s, %s )" % (
self.__class__.__name__,
def __unicode__(self):
return '%s( %s, %s )' % (self.__class__.__name__,
str(self._field_mapping),
str(self._merge_func),
)
str(self._merge_func))
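A minimal sketch of FieldMerger, mirroring the expectations in the tests further down this diff:
# Merge three numeric columns into a single 'sum' column; the source
# columns are dropped unless keep_fields=True is passed.
from saucebrush.filters import FieldMerger

fm = FieldMerger({"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z)
print(list(fm.attach([{"a": 1, "b": 2, "c": 3}])))  # [{'sum': 6}]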
class FieldAdder(Filter):
@ -313,15 +279,11 @@ class FieldAdder(Filter):
"""
def __init__(self, field_name, field_value, replace=True):
super().__init__()
super(FieldAdder, self).__init__()
self._field_name = field_name
self._field_value = field_value
if hasattr(self._field_value, "__iter__"):
value_iter = iter(self._field_value)
if hasattr(value_iter, "next"):
self._field_value = value_iter.next
else:
self._field_value = value_iter.__next__
if hasattr(self._field_value, '__iter__'):
self._field_value = iter(self._field_value).next
self._replace = replace
def process_record(self, record):
@ -333,12 +295,8 @@ class FieldAdder(Filter):
return record
def __unicode__(self):
return "%s( %s, %s )" % (
self.__class__.__name__,
self._field_name,
str(self._field_value),
)
return '%s( %s, %s )' % (self.__class__.__name__, self._field_name,
str(self._field_value))
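A short sketch of the field_value variants FieldAdder accepts (scalar, callable, or iterable), mirroring the tests below:
# An iterable is consumed one value per record; a callable is invoked
# per record, and a plain scalar is used as-is.
from saucebrush.filters import FieldAdder

fa = FieldAdder("x", [1, 2, 3])
print(list(fa.attach([{"a": 1}, {"a": 2}, {"a": 3}])))
# [{'a': 1, 'x': 1}, {'a': 2, 'x': 2}, {'a': 3, 'x': 3}]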
class FieldCopier(Filter):
""" Filter that copies one field to another.
@ -346,53 +304,31 @@ class FieldCopier(Filter):
Takes a dictionary mapping destination keys to source keys.
"""
def __init__(self, copy_mapping):
super().__init__()
super(FieldCopier, self).__init__()
self._copy_mapping = copy_mapping
def process_record(self, record):
# mapping is dest:source
for dest, source in self._copy_mapping.items():
for dest, source in self._copy_mapping.iteritems():
record[dest] = record[source]
return record
class FieldRenamer(Filter):
""" Filter that renames one field to another.
Takes a dictionary mapping destination keys to source keys.
"""
def __init__(self, rename_mapping):
super().__init__()
super(FieldRenamer, self).__init__()
self._rename_mapping = rename_mapping
def process_record(self, record):
# mapping is dest:source
for dest, source in self._rename_mapping.items():
for dest, source in self._rename_mapping.iteritems():
record[dest] = record.pop(source)
return record
class FieldNameModifier(Filter):
"""Filter that calls a given function on a given set of fields.
FieldNameModifier(('spam','eggs'), abs) to call the abs method on the spam
and eggs field names in each record filtered.
"""
def __init__(self, func):
super().__init__()
self._filter_func = func
def process_record(self, record):
for source in record.keys():
dest = self._filter_func(source)
record[dest] = record.pop(source)
return record
class Splitter(Filter):
""" Filter that splits nested data into different paths.
@ -405,11 +341,11 @@ class Splitter(Filter):
"""
def __init__(self, split_mapping):
super().__init__()
super(Splitter, self).__init__()
self._split_mapping = split_mapping
def process_record(self, record):
for key, filters in self._split_mapping.items():
for key, filters in self._split_mapping.iteritems():
# if the key doesn't exist -- move on to next key
try:
@ -447,9 +383,8 @@ class Flattener(FieldFilter):
{'addresses': [{'state': 'NC', 'street': '146 shirley drive'},
{'state': 'NY', 'street': '3000 Winton Rd'}]}
"""
def __init__(self, keys):
super().__init__(keys)
super(Flattener, self).__init__(keys)
def process_field(self, item):
result = []
@ -462,8 +397,8 @@ class Flattener(FieldFilter):
class DictFlattener(Filter):
def __init__(self, keys, separator="_"):
super().__init__()
def __init__(self, keys, separator='_'):
super(DictFlattener, self).__init__()
self._keys = utils.str_or_list(keys)
self._separator = separator
@ -472,10 +407,11 @@ class DictFlattener(Filter):
class Unique(ConditionalFilter):
"""Filter that ensures that all records passing through are unique."""
""" Filter that ensures that all records passing through are unique.
"""
def __init__(self):
super().__init__()
super(Unique, self).__init__()
self._seen = set()
def test_record(self, record):
@ -486,7 +422,6 @@ class Unique(ConditionalFilter):
else:
return False
class UniqueValidator(Unique):
validator = True
@ -498,8 +433,8 @@ class UniqueID(ConditionalFilter):
of a composite ID.
"""
def __init__(self, field="id", *args):
super().__init__()
def __init__(self, field='id', *args):
super(UniqueID, self).__init__()
self._seen = set()
self._id_fields = [field]
self._id_fields.extend(args)
@ -512,15 +447,44 @@ class UniqueID(ConditionalFilter):
else:
return False
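A sketch of composite-key deduplication with UniqueID, assuming test_record keys each record on the named fields as the docstring describes:
# Only the first record for a given (name, phone) pair passes through.
from saucebrush.filters import UniqueID

uid = UniqueID("name", "phone")
data = [{"name": "James", "phone": "555-1234", "n": 1},
        {"name": "James", "phone": "555-1234", "n": 2}]
print(list(uid.attach(data)))  # [{'name': 'James', 'phone': '555-1234', 'n': 1}]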
class UniqueIDValidator(UniqueID):
validator = True
###########################
# Commonly Used Filters #
###########################
class UnicodeFilter(Filter):
""" Convert all str elements in the record to Unicode.
"""
def __init__(self, encoding='utf-8', errors='ignore'):
super(UnicodeFilter, self).__init__()
self._encoding = encoding
self._errors = errors
def process_record(self, record):
for key, value in record.iteritems():
if isinstance(value, str):
record[key] = unicode(value, self._encoding, self._errors)
elif isinstance(value, unicode):
record[key] = value.decode(self._encoding, self._errors)
return record
class StringFilter(Filter):
def __init__(self, encoding='utf-8', errors='ignore'):
super(StringFilter, self).__init__()
self._encoding = encoding
self._errors = errors
def process_record(self, record):
for key, value in record.iteritems():
if isinstance(value, unicode):
record[key] = value.encode(self._encoding, self._errors)
return record
###########################
## Commonly Used Filters ##
###########################
class PhoneNumberCleaner(FieldFilter):
""" Filter that cleans phone numbers to match a given format.
@ -531,11 +495,10 @@ class PhoneNumberCleaner(FieldFilter):
PhoneNumberCleaner( ('phone','fax'), number_format='%s%s%s-%s%s%s-%s%s%s%s')
would format the phone & fax columns to 555-123-4567 format.
"""
def __init__(self, keys, number_format="%s%s%s.%s%s%s.%s%s%s%s"):
super().__init__(keys)
def __init__(self, keys, number_format='%s%s%s.%s%s%s.%s%s%s%s'):
super(PhoneNumberCleaner, self).__init__(keys)
self._number_format = number_format
self._num_re = re.compile(r"\d")
self._num_re = re.compile('\d')
def process_field(self, item):
nums = self._num_re.findall(item)
@ -543,21 +506,19 @@ class PhoneNumberCleaner(FieldFilter):
item = self._number_format % tuple(nums)
return item
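A sketch of the cleaner in use; a value that does not yield the expected digit count is presumably left alone (that branch is elided by the hunk above):
# Normalize a formatted US phone number to a dashed layout.
from saucebrush.filters import PhoneNumberCleaner

pc = PhoneNumberCleaner(("phone",), number_format="%s%s%s-%s%s%s-%s%s%s%s")
print(list(pc.attach([{"phone": "(555) 123-4567"}])))  # [{'phone': '555-123-4567'}]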
class DateCleaner(FieldFilter):
""" Filter that cleans dates to match a given format.
Takes a list of target keys and to and from formats in strftime format.
"""
def __init__(self, keys, from_format, to_format):
super().__init__(keys)
super(DateCleaner, self).__init__(keys)
self._from_format = from_format
self._to_format = to_format
def process_field(self, item):
return time.strftime(self._to_format, time.strptime(item, self._from_format))
return time.strftime(self._to_format,
time.strptime(item, self._from_format))
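A sketch converting a US-style date to ISO 8601 with the strptime/strftime pair above:
from saucebrush.filters import DateCleaner

dc = DateCleaner(("when",), "%m/%d/%Y", "%Y-%m-%d")
print(list(dc.attach([{"when": "11/03/1986"}])))  # [{'when': '1986-11-03'}]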
class NameCleaner(Filter):
""" Filter that splits names into a first, last, and middle name field.
@ -570,27 +531,21 @@ class NameCleaner(Filter):
"""
# first middle? last suffix?
FIRST_LAST = re.compile(
r"""^\s*(?:(?P<firstname>\w+)(?:\.?)
FIRST_LAST = re.compile('''^\s*(?:(?P<firstname>\w+)(?:\.?)
\s+(?:(?P<middlename>\w+)\.?\s+)?
(?P<lastname>[A-Za-z'-]+))
(?:\s+(?P<suffix>JR\.?|II|III|IV))?
\s*$""",
re.VERBOSE | re.IGNORECASE,
)
\s*$''', re.VERBOSE | re.IGNORECASE)
# last, first middle? suffix?
LAST_FIRST = re.compile(
r"""^\s*(?:(?P<lastname>[A-Za-z'-]+),
LAST_FIRST = re.compile('''^\s*(?:(?P<lastname>[A-Za-z'-]+),
\s+(?P<firstname>\w+)(?:\.?)
(?:\s+(?P<middlename>\w+)\.?)?)
(?:\s+(?P<suffix>JR\.?|II|III|IV))?
\s*$""",
re.VERBOSE | re.IGNORECASE,
)
\s*$''', re.VERBOSE | re.IGNORECASE)
def __init__(self, keys, prefix="", formats=None, nomatch_name=None):
super().__init__()
def __init__(self, keys, prefix='', formats=None, nomatch_name=None):
super(NameCleaner, self).__init__()
self._keys = utils.str_or_list(keys)
self._name_prefix = prefix
self._nomatch_name = nomatch_name
@ -611,7 +566,7 @@ class NameCleaner(Filter):
# if there is a match, remove original name and add pieces
if match:
record.pop(key)
for k, v in match.groupdict().items():
for k,v in match.groupdict().iteritems():
record[self._name_prefix + k] = v
break
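A sketch of NameCleaner matching the LAST_FIRST pattern above; groups that do not match, such as suffix here, come back as None from groupdict():
from saucebrush.filters import NameCleaner

nc = NameCleaner(("name",))
print(list(nc.attach([{"name": "Smith, John M."}])))
# [{'lastname': 'Smith', 'firstname': 'John', 'middlename': 'M', 'suffix': None}]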

259
saucebrush/sources.py Normal file

@ -0,0 +1,259 @@
"""
Saucebrush data sources, convert data in some format into python dicts.
All sources must implement the iterable interface and return python
dictionaries.
"""
import string
from saucebrush import utils
class CSVSource(object):
""" Saucebrush source for reading from CSV files.
Takes an open csvfile, an optional set of fieldnames and optional number
of rows to skip.
CSVSource(open('test.csv')) will read a csvfile, using the first row as
the field names.
CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read
in a CSV file and treat the three columns as name, phone, and address,
ignoring the first row (presumed to be column names).
"""
def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):
import csv
self._dictreader = csv.DictReader(csvfile, fieldnames, **kwargs)
for _ in xrange(skiprows):
self._dictreader.next()
def __iter__(self):
return self._dictreader
class FixedWidthFileSource(object):
""" Saucebrush source for reading from fixed width field files.
FixedWidthFileSource expects an open fixed width file and a tuple
of fields with their lengths. There is also an optional fillchars
command that is the filler characters to strip from the end of each
field. (defaults to whitespace)
FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
will read in a fixed width file where the first 30 characters of each
line are part of a name and the characters 31-42 are a phone number.
"""
def __init__(self, fwfile, fields, fillchars=string.whitespace):
self._fwfile = fwfile
self._fields_dict = {}
self._fillchars = fillchars
from_offset = 0
to_offset = 0
for field, size in fields:
to_offset += size
self._fields_dict[field] = (from_offset, to_offset)
from_offset += size
def __iter__(self):
return self
def next(self):
line = self._fwfile.next()
record = {}
for name, range_ in self._fields_dict.iteritems():
record[name] = line[range_[0]:range_[1]].rstrip(self._fillchars)
return record
class HtmlTableSource(object):
""" Saucebrush source for reading data from an HTML table.
HtmlTableSource expects an open html file, the id of the table or a
number indicating which table on the page to use, an optional fieldnames
tuple, and an optional number of rows to skip.
HtmlTableSource(open('test.html'), 0) opens the first HTML table and
uses the first row as the names of the columns.
HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens
the HTML table with an id of 'people' and names the two columns
name and phone, skipping the first row where alternate names are
stored.
"""
def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
# extract the table
from BeautifulSoup import BeautifulSoup
soup = BeautifulSoup(htmlfile.read())
if isinstance(id_or_num, int):
table = soup.findAll('table')[id_or_num]
elif isinstance(id_or_num, str):
table = soup.find('table', id=id_or_num)
# skip the necessary number of rows
self._rows = table.findAll('tr')[skiprows:]
# determine the fieldnames
if not fieldnames:
self._fieldnames = [td.string
for td in self._rows[0].findAll(('td','th'))]
else:
self._fieldnames = fieldnames
def process_tr(self):
for row in self._rows:
strings = [utils.string_dig(td) for td in row.findAll('td')]
yield dict(zip(self._fieldnames, strings))
def __iter__(self):
return self.process_tr()
class DjangoModelSource(object):
""" Saucebrush source for reading data from django models.
DjangoModelSource expects a django settings file, app label, and model
name. The resulting records contain all columns in the table for the
specified model.
DjangoModelSource('settings.py', 'phonebook', 'friend') would read all
friends from the friend model in the phonebook app described in
settings.py.
"""
def __init__(self, dj_settings, app_label, model_name):
dbmodel = utils.get_django_model(dj_settings, app_label, model_name)
# only get values defined in model (no extra fields from custom manager)
self._data = dbmodel.objects.values(*[f.name
for f in dbmodel._meta.fields])
def __iter__(self):
return iter(self._data)
class MongoDBSource(object):
""" Source for reading from a MongoDB database.
The record dict is populated with records matching the spec
from the specified database and collection.
"""
def __init__(self, database, collection, spec=None, host='localhost', port=27017, conn=None):
if not conn:
from pymongo.connection import Connection
conn = Connection(host, port)
self.collection = conn[database][collection]
self.spec = spec
def __iter__(self):
return self._find_spec()
def _find_spec(self):
for doc in self.collection.find(self.spec):
yield dict(doc)
# dict_factory for sqlite source
def dict_factory(cursor, row):
d = { }
for idx, col in enumerate(cursor.description):
d[col[0]] = row[idx]
return d
class SqliteSource(object):
""" Source that reads from a sqlite database.
The record dict is populated with the results from the
query argument. If given, args will be passed to the query
when executed.
"""
def __init__(self, dbpath, query, args=None, conn_params=None):
import sqlite3
self._dbpath = dbpath
self._query = query
self._args = args or []
self._conn_params = conn_params or []
# setup connection
self._conn = sqlite3.connect(self._dbpath)
self._conn.row_factory = dict_factory
if self._conn_params:
for param, value in self._conn_params.iteritems():
setattr(self._conn, param, value)
def _process_query(self):
cursor = self._conn.cursor()
for row in cursor.execute(self._query, self._args):
yield row
cursor.close()
def __iter__(self):
return self._process_query()
def done(self):
self._conn.close()
class FileSource(object):
""" Base class for sources which read from one or more files.
Takes as input a file-like, a file path, a list of file-likes,
or a list of file paths.
"""
def __init__(self, input):
self._input = input
def __iter__(self):
# This method would be a lot cleaner with the proposed
# 'yield from' expression (PEP 380)
if hasattr(self._input, '__read__'):
for record in self._process_file(self._input):
yield record
elif isinstance(self._input, basestring):
with open(self._input) as f:
for record in self._process_file(f):
yield record
elif hasattr(self._input, '__iter__'):
for el in self._input:
if isinstance(el, basestring):
with open(el) as f:
for record in self._process_file(f):
yield record
elif hasattr(el, '__read__'):
for record in self._process_file(el):
yield record
def _process_file(self, file):
raise NotImplementedError('Descendants of FileSource should implement'
' a custom _process_file method.')
class JSONSource(FileSource):
""" Source for reading from JSON files.
When processing JSON files, if the top-level object is a list, will
yield each member separately. Otherwise, yields the top-level
object.
"""
def _process_file(self, file):
import json
obj = json.load(file)
# If the top-level JSON object in the file is a list
# then yield each element separately; otherwise, yield
# the top-level object.
if isinstance(obj, list):
for record in obj:
yield record
else:
yield obj

46
saucebrush/stats.py Normal file

@ -0,0 +1,46 @@
from saucebrush.filters import Filter
class StatsFilter(Filter):
def __init__(self, field, test=None):
self._field = field
self._test = test if test else lambda x: True
def process_record(self, record):
if self._test(record):
self.process_field(record[self._field])
return record
def process_field(self, record):
raise NotImplementedError('process_field not defined in ' +
self.__class__.__name__)
def value(self):
raise NotImplementedError('value not defined in ' +
self.__class__.__name__)
class Sum(StatsFilter):
def __init__(self, field, initial=0, **kwargs):
super(Sum, self).__init__(field, **kwargs)
self._value = initial
def process_field(self, item):
self._value += item or 0
def value(self):
return self._value
class Average(StatsFilter):
def __init__(self, field, initial=0, **kwargs):
super(Average, self).__init__(field, **kwargs)
self._value = initial
self._count = 0
def process_field(self, item):
self._value += item or 0
self._count += 1
def value(self):
return self._value / self._count


@ -0,0 +1,13 @@
import unittest
from saucebrush.tests.filters import FilterTestCase
from saucebrush.tests.sources import SourceTestCase
from saucebrush.tests.emitters import EmitterTestCase
from saucebrush.tests.recipes import RecipeTestCase
filter_suite = unittest.TestLoader().loadTestsFromTestCase(FilterTestCase)
source_suite = unittest.TestLoader().loadTestsFromTestCase(SourceTestCase)
emitter_suite = unittest.TestLoader().loadTestsFromTestCase(EmitterTestCase)
recipe_suite = unittest.TestLoader().loadTestsFromTestCase(RecipeTestCase)
if __name__ == '__main__':
unittest.main()


@ -0,0 +1,34 @@
import unittest
from cStringIO import StringIO
from saucebrush.emitters import DebugEmitter, CSVEmitter, CountEmitter
class EmitterTestCase(unittest.TestCase):
def setUp(self):
self.output = StringIO()
def test_debug_emitter(self):
de = DebugEmitter(self.output)
data = de.attach([1,2,3])
for _ in data:
pass
self.assertEquals(self.output.getvalue(), '1\n2\n3\n')
def test_csv_emitter(self):
ce = CSVEmitter(self.output, ('x','y','z'))
data = ce.attach([{'x':1,'y':2,'z':3}, {'x':5, 'y':5, 'z':5}])
for _ in data:
pass
self.assertEquals(self.output.getvalue(), 'x,y,z\r\n1,2,3\r\n5,5,5\r\n')
def test_count_emitter(self):
ce = CountEmitter(every=10, outfile=self.output, format="%s records\n")
data = ce.attach([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22])
for _ in data:
pass
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n')
ce.done()
self.assertEquals(self.output.getvalue(), '10 records\n20 records\n22 records\n')
if __name__ == '__main__':
unittest.main()

296
saucebrush/tests/filters.py Normal file

@ -0,0 +1,296 @@
import unittest
import operator
import types
from saucebrush.filters import (Filter, YieldFilter, FieldFilter,
SubrecordFilter, ConditionalPathFilter,
ConditionalFilter, FieldModifier,
FieldRemover, FieldMerger, FieldAdder,
FieldCopier, FieldRenamer, Unique)
class DummyRecipe(object):
rejected_record = None
rejected_msg = None
def reject_record(self, record, msg):
self.rejected_record = record
self.rejected_msg = msg
class Doubler(Filter):
def process_record(self, record):
return record*2
class OddRemover(Filter):
def process_record(self, record):
if record % 2 == 0:
return record
else:
return None # explicitly return None
class ListFlattener(YieldFilter):
def process_record(self, record):
for item in record:
yield item
class FieldDoubler(FieldFilter):
def process_field(self, item):
return item*2
class NonModifyingFieldDoubler(Filter):
def __init__(self, key):
self.key = key
def process_record(self, record):
record = dict(record)
record[self.key] *= 2
return record
class ConditionalOddRemover(ConditionalFilter):
def test_record(self, record):
# return True for even values
return record % 2 == 0
class FilterTestCase(unittest.TestCase):
def _simple_data(self):
return [{'a':1, 'b':2, 'c':3},
{'a':5, 'b':5, 'c':5},
{'a':1, 'b':10, 'c':100}]
def assert_filter_result(self, filter_obj, expected_data):
result = filter_obj.attach(self._simple_data())
self.assertEquals(list(result), expected_data)
def test_reject_record(self):
recipe = DummyRecipe()
f = Doubler()
result = f.attach([1,2,3], recipe=recipe)
result.next() # next has to be called for attach to take effect
f.reject_record('bad', 'this one was bad')
# ensure that the rejection propagated to the recipe
self.assertEquals('bad', recipe.rejected_record)
self.assertEquals('this one was bad', recipe.rejected_msg)
def test_simple_filter(self):
df = Doubler()
result = df.attach([1,2,3])
# ensure we got a generator that yields 2,4,6
self.assertEquals(type(result), types.GeneratorType)
self.assertEquals(list(result), [2,4,6])
def test_simple_filter_return_none(self):
cf = OddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEquals(list(result), [0,2,4,6,8])
def test_simple_yield_filter(self):
lf = ListFlattener()
result = lf.attach([[1],[2,3],[4,5,6]])
# ensure we got a generator that yields 1,2,3,4,5,6
self.assertEquals(type(result), types.GeneratorType)
self.assertEquals(list(result), [1,2,3,4,5,6])
def test_simple_field_filter(self):
ff = FieldDoubler(['a', 'c'])
# check against expected data
expected_data = [{'a':2, 'b':2, 'c':6},
{'a':10, 'b':5, 'c':10},
{'a':2, 'b':10, 'c':200}]
self.assert_filter_result(ff, expected_data)
def test_conditional_filter(self):
cf = ConditionalOddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEquals(list(result), [0,2,4,6,8])
### Tests for Subrecord
def test_subrecord_filter_list(self):
data = [{'a': [{'b': 2}, {'b': 4}]},
{'a': [{'b': 5}]},
{'a': [{'b': 8}, {'b':2}, {'b':1}]}]
expected = [{'a': [{'b': 4}, {'b': 8}]},
{'a': [{'b': 10}]},
{'a': [{'b': 16}, {'b':4}, {'b':2}]}]
sf = SubrecordFilter('a', NonModifyingFieldDoubler('b'))
result = sf.attach(data)
self.assertEquals(list(result), expected)
def test_subrecord_filter_deep(self):
data = [{'a': {'d':[{'b': 2}, {'b': 4}]}},
{'a': {'d':[{'b': 5}]}},
{'a': {'d':[{'b': 8}, {'b':2}, {'b':1}]}}]
expected = [{'a': {'d':[{'b': 4}, {'b': 8}]}},
{'a': {'d':[{'b': 10}]}},
{'a': {'d':[{'b': 16}, {'b':4}, {'b':2}]}}]
sf = SubrecordFilter('a.d', NonModifyingFieldDoubler('b'))
result = sf.attach(data)
self.assertEquals(list(result), expected)
def test_subrecord_filter_nonlist(self):
data = [
{'a':{'b':{'c':1}}},
{'a':{'b':{'c':2}}},
{'a':{'b':{'c':3}}},
]
expected = [
{'a':{'b':{'c':2}}},
{'a':{'b':{'c':4}}},
{'a':{'b':{'c':6}}},
]
sf = SubrecordFilter('a.b', NonModifyingFieldDoubler('c'))
result = sf.attach(data)
self.assertEquals(list(result), expected)
def test_subrecord_filter_list_in_path(self):
data = [
{'a': [{'b': {'c': 5}}, {'b': {'c': 6}}]},
{'a': [{'b': {'c': 1}}, {'b': {'c': 2}}, {'b': {'c': 3}}]},
{'a': [{'b': {'c': 2}} ]}
]
expected = [
{'a': [{'b': {'c': 10}}, {'b': {'c': 12}}]},
{'a': [{'b': {'c': 2}}, {'b': {'c': 4}}, {'b': {'c': 6}}]},
{'a': [{'b': {'c': 4}} ]}
]
sf = SubrecordFilter('a.b', NonModifyingFieldDoubler('c'))
result = sf.attach(data)
self.assertEquals(list(result), expected)
def test_conditional_path(self):
predicate = lambda r: r['a'] == 1
# double b if a == 1, otherwise double c
cpf = ConditionalPathFilter(predicate, FieldDoubler('b'),
FieldDoubler('c'))
expected_data = [{'a':1, 'b':4, 'c':3},
{'a':5, 'b':5, 'c':10},
{'a':1, 'b':20, 'c':100}]
self.assert_filter_result(cpf, expected_data)
### Tests for Generic Filters
def test_field_modifier(self):
# another version of FieldDoubler
fm = FieldModifier(['a', 'c'], lambda x: x*2)
# check against expected data
expected_data = [{'a':2, 'b':2, 'c':6},
{'a':10, 'b':5, 'c':10},
{'a':2, 'b':10, 'c':200}]
self.assert_filter_result(fm, expected_data)
def test_field_remover(self):
fr = FieldRemover(['a', 'b'])
# check against expected results
expected_data = [{'c':3}, {'c':5}, {'c':100}]
self.assert_filter_result(fr, expected_data)
def test_field_merger(self):
fm = FieldMerger({'sum':('a','b','c')}, lambda x,y,z: x+y+z)
# check against expected results
expected_data = [{'sum':6}, {'sum':15}, {'sum':111}]
self.assert_filter_result(fm, expected_data)
def test_field_merger_keep_fields(self):
fm = FieldMerger({'sum':('a','b','c')}, lambda x,y,z: x+y+z,
keep_fields=True)
# check against expected results
expected_data = [{'a':1, 'b':2, 'c':3, 'sum':6},
{'a':5, 'b':5, 'c':5, 'sum':15},
{'a':1, 'b':10, 'c':100, 'sum': 111}]
self.assert_filter_result(fm, expected_data)
def test_field_adder_scalar(self):
fa = FieldAdder('x', 7)
expected_data = [{'a':1, 'b':2, 'c':3, 'x':7},
{'a':5, 'b':5, 'c':5, 'x':7},
{'a':1, 'b':10, 'c':100, 'x': 7}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_callable(self):
fa = FieldAdder('x', lambda: 7)
expected_data = [{'a':1, 'b':2, 'c':3, 'x':7},
{'a':5, 'b':5, 'c':5, 'x':7},
{'a':1, 'b':10, 'c':100, 'x': 7}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_iterable(self):
fa = FieldAdder('x', [1,2,3])
expected_data = [{'a':1, 'b':2, 'c':3, 'x':1},
{'a':5, 'b':5, 'c':5, 'x':2},
{'a':1, 'b':10, 'c':100, 'x': 3}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_replace(self):
fa = FieldAdder('b', lambda: 7)
expected_data = [{'a':1, 'b':7, 'c':3},
{'a':5, 'b':7, 'c':5},
{'a':1, 'b':7, 'c':100}]
self.assert_filter_result(fa, expected_data)
def test_field_adder_no_replace(self):
fa = FieldAdder('b', lambda: 7, replace=False)
expected_data = [{'a':1, 'b':2, 'c':3},
{'a':5, 'b':5, 'c':5},
{'a':1, 'b':10, 'c':100}]
self.assert_filter_result(fa, expected_data)
def test_field_copier(self):
fc = FieldCopier({'a2':'a', 'b2':'b'})
expected_data = [{'a':1, 'b':2, 'c':3, 'a2':1, 'b2':2},
{'a':5, 'b':5, 'c':5, 'a2':5, 'b2':5},
{'a':1, 'b':10, 'c':100, 'a2': 1, 'b2': 10}]
self.assert_filter_result(fc, expected_data)
def test_field_renamer(self):
fr = FieldRenamer({'x':'a', 'y':'b'})
expected_data = [{'x':1, 'y':2, 'c':3},
{'x':5, 'y':5, 'c':5},
{'x':1, 'y':10, 'c':100}]
self.assert_filter_result(fr, expected_data)
# TODO: splitter & flattener tests?
def test_unique_filter(self):
u = Unique()
in_data = [{'a': 77}, {'a':33}, {'a': 77}]
expected_data = [{'a': 77}, {'a':33}]
result = u.attach(in_data)
self.assertEquals(list(result), expected_data)
# TODO: unicode & string filter tests
if __name__ == '__main__':
unittest.main()


@ -0,0 +1,53 @@
import doctest
import unittest
from saucebrush import Recipe, run_recipe, SaucebrushError, OvercookedError
from saucebrush.filters import Filter
class Raiser(Filter):
def process_record(self, record):
raise Exception("bad record")
class Saver(Filter):
def __init__(self):
self.saved = []
def process_record(self, record):
self.saved.append(record)
return record
class RecipeTestCase(unittest.TestCase):
def test_error_stream(self):
saver = Saver()
recipe = Recipe(Raiser(), error_stream=saver)
recipe.run([{'a': 1}, {'b': 2}])
recipe.done()
self.assertEqual(saver.saved[0]['record'], {'a': 1})
self.assertEqual(saver.saved[1]['record'], {'b': 2})
# Must pass either a Recipe, a Filter or an iterable of Filters
# as the error_stream argument
self.assertRaises(SaucebrushError, Recipe, error_stream=5)
def test_run_recipe(self):
saver = Saver()
run_recipe([1, 2], saver)
self.assertEqual(saver.saved, [1, 2])
def test_done(self):
saver = Saver()
recipe = Recipe(saver)
recipe.run([1])
recipe.done()
self.assertRaises(OvercookedError, recipe.run, [2])
self.assertRaises(OvercookedError, recipe.done)
self.assertEqual(saver.saved, [1])
if __name__ == '__main__':
unittest.main()


@ -0,0 +1,58 @@
import unittest
import cStringIO
from saucebrush.sources import CSVSource, FixedWidthFileSource
class SourceTestCase(unittest.TestCase):
def _get_csv(self):
data = '''a,b,c
1,2,3
5,5,5
1,10,100'''
return cStringIO.StringIO(data)
def test_csv_source_basic(self):
source = CSVSource(self._get_csv())
expected_data = [{'a':'1', 'b':'2', 'c':'3'},
{'a':'5', 'b':'5', 'c':'5'},
{'a':'1', 'b':'10', 'c':'100'}]
self.assertEquals(list(source), expected_data)
def test_csv_source_fieldnames(self):
source = CSVSource(self._get_csv(), ['x','y','z'])
expected_data = [{'x':'a', 'y':'b', 'z':'c'},
{'x':'1', 'y':'2', 'z':'3'},
{'x':'5', 'y':'5', 'z':'5'},
{'x':'1', 'y':'10', 'z':'100'}]
self.assertEquals(list(source), expected_data)
def test_csv_source_skiprows(self):
source = CSVSource(self._get_csv(), skiprows=1)
expected_data = [{'a':'5', 'b':'5', 'c':'5'},
{'a':'1', 'b':'10', 'c':'100'}]
self.assertEquals(list(source), expected_data)
def test_fixed_width_source(self):
data = cStringIO.StringIO('JamesNovember 3 1986\nTim  September151999')
fields = (('name',5), ('month',9), ('day',2), ('year',4))
source = FixedWidthFileSource(data, fields)
expected_data = [{'name':'James', 'month':'November', 'day':'3',
'year':'1986'},
{'name':'Tim', 'month':'September', 'day':'15',
'year':'1999'}]
self.assertEquals(list(source), expected_data)
def test_fixed_width_source_fillchars(self):
data = cStringIO.StringIO('JamesNovember.3.1986\nTim..September151999')
fields = (('name',5), ('month',9), ('day',2), ('year',4))
source = FixedWidthFileSource(data, fields, fillchars='.')
expected_data = [{'name':'James', 'month':'November', 'day':'3',
'year':'1986'},
{'name':'Tim', 'month':'September', 'day':'15',
'year':'1999'}]
self.assertEquals(list(source), expected_data)
if __name__ == '__main__':
unittest.main()

97
saucebrush/utils.py Normal file

@ -0,0 +1,97 @@
"""
General utilities used within saucebrush that may be useful elsewhere.
"""
def get_django_model(dj_settings, app_label, model_name):
"""
Get a django model given a settings file, app label, and model name.
"""
from django.conf import settings
if not settings.configured:
settings.configure(DATABASE_ENGINE=dj_settings.DATABASE_ENGINE,
DATABASE_NAME=dj_settings.DATABASE_NAME,
DATABASE_USER=dj_settings.DATABASE_USER,
DATABASE_PASSWORD=dj_settings.DATABASE_PASSWORD,
DATABASE_HOST=dj_settings.DATABASE_HOST,
INSTALLED_APPS=dj_settings.INSTALLED_APPS)
from django.db.models import get_model
return get_model(app_label, model_name)
def string_dig(element, separator=''):
"""
Dig into BeautifulSoup HTML elements looking for inner strings.
If element resembled: <p><b>test</b><em>test</em></p>
then string_dig(element, '~') would return test~test
"""
if element.string:
return element.string
else:
return separator.join([string_dig(child, separator)
for child in element.findAll(True)])
def flatten(item, prefix='', separator='_', keys=None):
"""
Flatten nested dictionary into one with its keys concatenated together.
>>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
'f':{'g':{'h':6}}})
{'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
"""
# update dictionaries recursively
if isinstance(item, dict):
# don't prepend a leading _
if prefix != '':
prefix += separator
retval = {}
for key, value in item.iteritems():
if (not keys) or (key in keys):
retval.update(flatten(value, prefix + key, separator, keys))
else:
retval[prefix + key] = value
return retval
#elif isinstance(item, (tuple, list)):
# return {prefix: [flatten(i, prefix, separator, keys) for i in item]}
else:
print item, prefix
return {prefix: item}
def str_or_list(obj):
if isinstance(obj, str):
return [obj]
else:
return obj
#
# utility classes
#
class Files(object):
def __init__(self, *args):
self.paths = []
for arg in args:
self.add(arg)
self.file_open_callback = None
def add(self, path):
self.paths.append(path)
def __iter__(self):
return self.linereader()
def linereader(self):
import os
for path in iter(self.paths):
if os.path.exists(path):
if self.file_open_callback:
self.file_open_callback(path)
f = open(path)
for line in f:
yield line
f.close()

7
setup.py Normal file

@ -0,0 +1,7 @@
#!/usr/bin/env python
from setuptools import setup
setup(name="saucebrush",
version='0.1',
packages=['saucebrush'],
)


@ -1,297 +0,0 @@
"""
Saucebrush Emitters are filters that instead of modifying the record, output
it in some manner.
"""
from saucebrush.filters import Filter
class Emitter(Filter):
"""ABC for emitters
All derived emitters must provide an emit_record(self, record) that
takes a single record (python dictionary).
Emitters can optionally define a done() method that is called after
all records are processed (allowing database flushes, or printing of
aggregate data).
"""
def process_record(self, record):
self.emit_record(record)
return record
def emit_record(self, record):
"""Abstract method to be overridden.
Called with a single record, should "emit" the record unmodified.
"""
raise NotImplementedError(
"emit_record not defined in " + self.__class__.__name__
)
def done(self):
"""No-op Method to be overridden.
Called when all processing is complete
"""
pass
class DebugEmitter(Emitter):
"""Emitter that prints raw records to a file, useful for debugging.
DebugEmitter() by default prints to stdout.
DebugEmitter(open('test', 'w')) would print to a file named test
"""
def __init__(self, outfile=None):
super().__init__()
if not outfile:
import sys
self._outfile = sys.stdout
else:
self._outfile = outfile
def emit_record(self, record):
self._outfile.write("{0}\n".format(record))
class CountEmitter(Emitter):
"""Emitter that writes the record count to a file-like object.
CountEmitter() by default writes to stdout.
CountEmitter(outfile=open('test', 'w')) would print to a file named test.
CountEmitter(every=1000000) would write the count every 1,000,000 records.
CountEmitter(every=100, of=2000) would write "<count> of 2000" every 100 records.
"""
def __init__(self, every=1000, of=None, outfile=None, format=None):
super().__init__()
if not outfile:
import sys
self._outfile = sys.stdout
else:
self._outfile = outfile
if format is None:
if of is not None:
format = "%(count)s of %(of)s\n"
else:
format = "%(count)s\n"
self._format = format
self._every = every
self._of = of
self.count = 0
def format(self):
return self._format % {"count": self.count, "of": self._of}
def emit_record(self, record):
self.count += 1
if self.count % self._every == 0:
self._outfile.write(self.format())
def done(self):
self._outfile.write(self.format())
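A sketch of CountEmitter's cadence: one line per batch of records, plus a final line from done():
import sys
from saucebrush.emitters import CountEmitter

ce = CountEmitter(every=2, outfile=sys.stdout)  # default format "%(count)s\n"
for _ in ce.attach([10, 20, 30, 40, 50]):
    pass
ce.done()  # prints 2, 4, then a final 5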
class CSVEmitter(Emitter):
"""Emitter that writes records to a CSV file.
CSVEmitter(open('output.csv','w'), ('id', 'name', 'phone')) writes all
records to a csvfile with the columns in the order specified.
"""
def __init__(self, csvfile, fieldnames):
super().__init__()
import csv
self._dictwriter = csv.DictWriter(csvfile, fieldnames)
# write header row
header_row = dict(zip(fieldnames, fieldnames))
self._dictwriter.writerow(header_row)
def emit_record(self, record):
self._dictwriter.writerow(record)
class SqliteEmitter(Emitter):
"""Emitter that writes records to a SQLite database.
SqliteEmitter('addressbook.db', 'friend') writes all records to the
friends table in the SQLite database named addressbook.db
(To have the emitter create the table, the fieldnames should be passed
as a third parameter to SqliteEmitter.)
"""
def __init__(self, dbname, table_name, fieldnames=None, replace=False, quiet=False):
super().__init__()
import sqlite3
self._conn = sqlite3.connect(dbname)
self._cursor = self._conn.cursor()
self._table_name = table_name
self._replace = replace
self._quiet = quiet
if fieldnames:
create = "CREATE TABLE IF NOT EXISTS %s (%s)" % (
table_name,
", ".join([" ".join((field, "TEXT")) for field in fieldnames]),
)
self._cursor.execute(create)
def emit_record(self, record):
import sqlite3
# input should be escaped with ? if data isn't trusted
qmarks = ",".join(("?",) * len(record))
insert = "INSERT OR REPLACE" if self._replace else "INSERT"
insert = "%s INTO %s (%s) VALUES (%s)" % (
insert,
self._table_name,
",".join(record.keys()),
qmarks,
)
try:
self._cursor.execute(insert, list(record.values()))
except sqlite3.IntegrityError as ie:
if not self._quiet:
raise ie
self.reject_record(record, ie.message)
def done(self):
self._conn.commit()
self._conn.close()
class SqlDumpEmitter(Emitter):
"""Emitter that writes SQL INSERT statements.
The output generated by the SqlDumpEmitter is intended to be used to
populate a mySQL database.
SqlDumpEmitter(open('addresses.sql', 'w'), 'friend', ('name', 'phone'))
writes statements to addresses.sql to insert the data
into the friends table.
"""
def __init__(self, outfile, table_name, fieldnames):
super().__init__()
self._fieldnames = fieldnames
if not outfile:
import sys
self._outfile = sys.stderr
else:
self._outfile = outfile
self._insert_str = "INSERT INTO `%s` (`%s`) VALUES (%%s);\n" % (
table_name,
"`,`".join(fieldnames),
)
def quote(self, item):
if item is None:
return "null"
try:
types = (basestring,)
except NameError:
types = (str,)
if isinstance(item, types):
item = item.replace("\\", "\\\\").replace("'", "\\'").replace(chr(0), "0")
return "'%s'" % item
return "%s" % item
def emit_record(self, record):
quoted_data = [self.quote(record[field]) for field in self._fieldnames]
self._outfile.write(self._insert_str % ",".join(quoted_data))
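A sketch matching the emitter test later in this diff:
from io import StringIO
from saucebrush.emitters import SqlDumpEmitter

out = StringIO()
sde = SqlDumpEmitter(out, "testtable", ("a", "b"))
list(sde.attach([{"a": 1, "b": "2"}]))
print(out.getvalue())  # INSERT INTO `testtable` (`a`,`b`) VALUES (1,'2');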
class DjangoModelEmitter(Emitter):
"""Emitter that populates a table corresponding to a django model.
Takes a django settings file, app label and model name and uses django
to insert the records into the appropriate table.
DjangoModelEmitter('settings.py', 'addressbook', 'friend') writes
records to addressbook.models.friend model using database settings
from settings.py.
"""
def __init__(self, dj_settings, app_label, model_name):
super().__init__()
from saucebrush.utils import get_django_model
self._dbmodel = get_django_model(dj_settings, app_label, model_name)
if not self._dbmodel:
raise Exception("No such model: %s %s" % (app_label, model_name))
def emit_record(self, record):
self._dbmodel.objects.create(**record)
class MongoDBEmitter(Emitter):
"""Emitter that creates a document in a MongoDB datastore
The names of the database and collection in which the records will
be inserted are required parameters. The host and port are optional,
defaulting to 'localhost' and 27017, respectively.
"""
def __init__(
self,
database,
collection,
host="localhost",
port=27017,
drop_collection=False,
conn=None,
):
super().__init__()
from pymongo.database import Database
if not isinstance(database, Database):
if not conn:
from pymongo.connection import Connection
conn = Connection(host, port)
db = conn[database]
else:
db = database
if drop_collection:
db.drop_collection(collection)
self.collection = db[collection]
def emit_record(self, record):
self.collection.insert(record)
class LoggingEmitter(Emitter):
"""Emitter that logs to a Python logging.Logger instance.
The msg_template will be passed the record being emitted as
a format parameter. The resulting message will get logged
at the provided level.
"""
import logging
def __init__(self, logger, msg_template, level=logging.DEBUG):
super().__init__()
self.logger = logger
self.msg_template = msg_template
self.level = level
def emit_record(self, record):
self.logger.log(self.level, self.msg_template % record)


@ -1,327 +0,0 @@
"""
Saucebrush data sources, convert data in some format into python dicts.
All sources must implement the iterable interface and return python
dictionaries.
"""
import string
from saucebrush import utils
class CSVSource:
"""Saucebrush source for reading from CSV files.
Takes an open csvfile, an optional set of fieldnames and optional number
of rows to skip.
CSVSource(open('test.csv')) will read a csvfile, using the first row as
the field names.
CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read
in a CSV file and treat the three columns as name, phone, and address,
ignoring the first row (presumed to be column names).
"""
def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):
import csv
self._dictreader = csv.DictReader(csvfile, fieldnames, **kwargs)
for _ in range(skiprows):
next(self._dictreader)
def __iter__(self):
return self._dictreader
class FixedWidthFileSource:
"""Saucebrush source for reading from fixed width field files.
FixedWidthFileSource expects an open fixed width file and a tuple
of fields with their lengths. There is also an optional fillchars
command that is the filler characters to strip from the end of each
field. (defaults to whitespace)
FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
will read in a fixed width file where the first 30 characters of each
line are part of a name and the characters 31-42 are a phone number.
"""
def __init__(self, fwfile, fields, fillchars=string.whitespace):
self._fwfile = fwfile
self._fields_dict = {}
self._fillchars = fillchars
from_offset = 0
to_offset = 0
for field, size in fields:
to_offset += size
self._fields_dict[field] = (from_offset, to_offset)
from_offset += size
def __iter__(self):
return self
def __next__(self):
line = next(self._fwfile)
record = {}
for name, range_ in self._fields_dict.items():
record[name] = line[range_[0] : range_[1]].rstrip(self._fillchars)
return record
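A sketch mirroring the fixed-width test elsewhere in this diff (note the two pad spaces after 'Tim'):
from io import StringIO
from saucebrush.sources import FixedWidthFileSource

data = StringIO("JamesNovember 3 1986\nTim  September151999")
fields = (("name", 5), ("month", 9), ("day", 2), ("year", 4))
for record in FixedWidthFileSource(data, fields):
    print(record)
# {'name': 'James', 'month': 'November', 'day': '3', 'year': '1986'}
# {'name': 'Tim', 'month': 'September', 'day': '15', 'year': '1999'}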
class HtmlTableSource:
"""Saucebrush source for reading data from an HTML table.
HtmlTableSource expects an open html file, the id of the table or a
number indicating which table on the page to use, an optional fieldnames
tuple, and an optional number of rows to skip.
HtmlTableSource(open('test.html'), 0) opens the first HTML table and
uses the first row as the names of the columns.
HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens
the HTML table with an id of 'people' and names the two columns
name and phone, skipping the first row where alternate names are
stored.
"""
def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
# extract the table
from lxml.html import parse
doc = parse(htmlfile).getroot()
if isinstance(id_or_num, int):
table = doc.cssselect("table")[id_or_num]
else:
table = doc.cssselect("table#%s" % id_or_num)
table = table[0] # get the first table
# skip the necessary number of rows
self._rows = table.cssselect("tr")[skiprows:]
# determine the fieldnames
if not fieldnames:
self._fieldnames = [
td.text_content() for td in self._rows[0].cssselect("td, th")
]
skiprows += 1
else:
self._fieldnames = fieldnames
# skip the necessary number of rows
self._rows = table.cssselect("tr")[skiprows:]
def process_tr(self):
for row in self._rows:
strings = [td.text_content() for td in row.cssselect("td")]
yield dict(zip(self._fieldnames, strings))
def __iter__(self):
return self.process_tr()
class DjangoModelSource:
"""Saucebrush source for reading data from django models.
DjangoModelSource expects a django settings file, app label, and model
name. The resulting records contain all columns in the table for the
specified model.
DjangoModelSource('settings.py', 'phonebook', 'friend') would read all
friends from the friend model in the phonebook app described in
settings.py.
"""
def __init__(self, dj_settings, app_label, model_name):
dbmodel = utils.get_django_model(dj_settings, app_label, model_name)
# only get values defined in model (no extra fields from custom manager)
self._data = dbmodel.objects.values(*[f.name for f in dbmodel._meta.fields])
def __iter__(self):
return iter(self._data)
class MongoDBSource:
"""Source for reading from a MongoDB database.
The record dict is populated with records matching the spec
from the specified database and collection.
"""
def __init__(
self, database, collection, spec=None, host="localhost", port=27017, conn=None
):
if not conn:
from pymongo.connection import Connection
conn = Connection(host, port)
self.collection = conn[database][collection]
self.spec = spec
def __iter__(self):
return self._find_spec()
def _find_spec(self):
for doc in self.collection.find(self.spec):
yield dict(doc)
# dict_factory for sqlite source
def dict_factory(cursor, row):
d = {}
for idx, col in enumerate(cursor.description):
d[col[0]] = row[idx]
return d
class SqliteSource:
"""Source that reads from a sqlite database.
The record dict is populated with the results from the
query argument. If given, args will be passed to the query
when executed.
"""
def __init__(self, dbpath, query, args=None, conn_params=None):
import sqlite3
self._dbpath = dbpath
self._query = query
self._args = args or []
self._conn_params = conn_params or []
# setup connection
self._conn = sqlite3.connect(self._dbpath)
self._conn.row_factory = dict_factory
if self._conn_params:
for param, value in self._conn_params.items():
setattr(self._conn, param, value)
def _process_query(self):
cursor = self._conn.cursor()
for row in cursor.execute(self._query, self._args):
yield row
cursor.close()
def __iter__(self):
return self._process_query()
def done(self):
self._conn.close()
class FileSource:
"""Base class for sources which read from one or more files.
Takes as input a file-like, a file path, a list of file-likes,
or a list of file paths.
"""
def __init__(self, input):
self._input = input
def __iter__(self):
# This method would be a lot cleaner with the proposed
# 'yield from' expression (PEP 380)
if hasattr(self._input, "__read__") or hasattr(self._input, "read"):
for record in self._process_file(self._input):
yield record
elif isinstance(self._input, str):
with open(self._input) as f:
for record in self._process_file(f):
yield record
elif hasattr(self._input, "__iter__"):
for el in self._input:
if isinstance(el, str):
with open(el) as f:
for record in self._process_file(f):
yield record
elif hasattr(el, "__read__") or hasattr(el, "read"):
for record in self._process_file(el):
yield record
def _process_file(self, file):
raise NotImplementedError(
"Descendants of FileSource should implement"
" a custom _process_file method."
)
class JSONSource(FileSource):
"""Source for reading from JSON files.
When processing JSON files, if the top-level object is a list, will
yield each member separately. Otherwise, yields the top-level
object.
"""
def _process_file(self, f):
import json
obj = json.load(f)
# If the top-level JSON object in the file is a list
# then yield each element separately; otherwise, yield
# the top-level object.
if isinstance(obj, list):
for record in obj:
yield record
else:
yield obj
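A sketch of JSONSource; people.json is a hypothetical file holding a top-level JSON list:
from saucebrush.sources import JSONSource

# Each element of the top-level list becomes one record; a top-level
# object would be yielded whole.
for record in JSONSource("people.json"):  # illustrative path
    print(record)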
class XMLSource(FileSource):
"""Source for reading from XML files. Use with the same kind of caution
that you use to approach anything written in XML.
When processing XML files, if the top-level object is a list, will
yield each member separately, unless the dotted path to a list is
included. You can also do this with a SubrecordFilter, but XML is
almost never going to be useful at the top level.
"""
def __init__(self, input, node_path=None, attr_prefix="ATTR_", postprocessor=None):
super().__init__(input)
self.node_list = node_path.split(".")
self.attr_prefix = attr_prefix
self.postprocessor = postprocessor
def _process_file(self, f, attr_prefix="ATTR_"):
"""xmltodict can either return attributes of nodes as prefixed fields
(prefixes to avoid key collisions), or ignore them altogether.
set attr prefix to whatever you want. Setting it to False ignores
attributes.
"""
import xmltodict
if self.postprocessor:
obj = xmltodict.parse(
f, attr_prefix=self.attr_prefix, postprocessor=self.postprocessor
)
else:
obj = xmltodict.parse(f, attr_prefix=self.attr_prefix)
# If node list was given, walk down the tree
if self.node_list:
for node in self.node_list:
obj = obj[node]
# If the top-level XML object in the file is a list
# then yield each element separately; otherwise, yield
# the top-level object.
if isinstance(obj, list):
for record in obj:
yield record
else:
yield obj
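A sketch of XMLSource; feed.xml and the node_path are illustrative, and xmltodict (imported above) is assumed to be installed:
from saucebrush.sources import XMLSource

# Walk down rss -> channel -> item and yield one record per <item>.
for record in XMLSource("feed.xml", node_path="rss.channel.item"):
    print(record)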


@ -1,233 +0,0 @@
from saucebrush.filters import Filter
import collections
import math
def _average(values):
"""Calculate the average of a list of values.
:param values: an iterable of ints or floats to average
"""
value_count = len(values)
if len(values) > 0:
return sum(values) / float(value_count)
def _median(values):
"""Calculate the median of a list of values.
:param values: an iterable of ints or floats to calculate
"""
count = len(values)
# bail early before sorting if 0 or 1 values in list
if count == 0:
return None
elif count == 1:
return values[0]
values = sorted(values)
if count % 2 == 1:
# odd number of items, return middle value
return float(values[int(count / 2)])
else:
# even number of items, return average of middle two items
mid = int(count / 2)
return sum(values[mid - 1 : mid + 1]) / 2.0
def _stddev(values, population=False):
"""Calculate the standard deviation and variance of a list of values.
:param values: an iterable of ints or floats to calculate
:param population: True if values represents entire population,
False if it is a sample of the population
"""
avg = _average(values)
count = len(values) if population else len(values) - 1
# square the difference between each value and the average
diffsq = ((i - avg) ** 2 for i in values)
# the average of the squared differences
variance = sum(diffsq) / float(count)
return (math.sqrt(variance), variance) # stddev is sqrt of variance
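A worked check of the helpers above: the population [2, 4, 4, 4, 5, 5, 7, 9] has mean 5, variance 32/8 = 4, and standard deviation 2.
values = [2, 4, 4, 4, 5, 5, 7, 9]
print(_average(values))                  # 5.0
print(_median(values))                   # 4.5
print(_stddev(values, population=True))  # (2.0, 4.0)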
class StatsFilter(Filter):
"""Base for all stats filters."""
def __init__(self, field, test=None):
self._field = field
self._test = test
def process_record(self, record):
if self._test is None or self._test(record):
self.process_field(record[self._field])
return record
def process_field(self, record):
raise NotImplementedError(
"process_field not defined in " + self.__class__.__name__
)
def value(self):
raise NotImplementedError("value not defined in " + self.__class__.__name__)
class Sum(StatsFilter):
"""Calculate the sum of the values in a field. Field must contain either
int or float values.
"""
def __init__(self, field, initial=0, **kwargs):
super().__init__(field, **kwargs)
self._value = initial
def process_field(self, item):
self._value += item or 0
def value(self):
return self._value
class Average(StatsFilter):
"""Calculate the average (mean) of the values in a field. Field must
contain either int or float values.
"""
def __init__(self, field, initial=0, **kwargs):
super().__init__(field, **kwargs)
self._value = initial
self._count = 0
def process_field(self, item):
if item is not None:
self._value += item
self._count += 1
def value(self):
return self._value / float(self._count)
class Median(StatsFilter):
"""Calculate the median of the values in a field. Field must contain
either int or float values.
**This filter keeps a list of field values in memory.**
"""
def __init__(self, field, **kwargs):
super().__init__(field, **kwargs)
self._values = []
def process_field(self, item):
if item is not None:
self._values.append(item)
def value(self):
return _median(self._values)
class MinMax(StatsFilter):
"""Find the minimum and maximum values in a field. Field must contain
either int or float values.
"""
def __init__(self, field, **kwargs):
super().__init__(field, **kwargs)
self._max = None
self._min = None
def process_field(self, item):
if item is not None:
if self._max is None or item > self._max:
self._max = item
if self._min is None or item < self._min:
self._min = item
def value(self):
return (self._min, self._max)
class StandardDeviation(StatsFilter):
"""Calculate the standard deviation of the values in a field. Calling
value() will return a standard deviation for the sample. Pass
population=True to value() for the standard deviation of the
population. Convenience methods are provided for average() and
median(). Field must contain either int or float values.
**This filter keeps a list of field values in memory.**
"""
def __init__(self, field, **kwargs):
super().__init__(field, **kwargs)
self._values = []
def process_field(self, item):
if item is not None:
self._values.append(item)
def average(self):
return _average(self._values)
def median(self):
return _median(self._values)
def value(self, population=False):
"""Return a tuple of (standard_deviation, variance).
:param population: True if values represents entire population,
False if values is a sample. Default: False
"""
return _stddev(self._values, population)
class Histogram(StatsFilter):
"""Generate a basic histogram of the specified field. The value() method
returns a dict of value to occurrence count mappings. The __str__ method
generates a basic and limited histogram useful for printing to the
command line. The label_length attribute determines the padding and
cut-off of the basic histogram labels.
**This filter maintains a dict of unique field values in memory.**
"""
label_length = 6
def __init__(self, field, **kwargs):
super().__init__(field, **kwargs)
self._counter = collections.Counter()
def process_field(self, item):
self._counter[self.prep_field(item)] += 1
def prep_field(self, item):
return item
def value(self):
return self._counter.copy()
def in_order(self):
ordered = []
for key in sorted(self._counter.keys()):
ordered.append((key, self._counter[key]))
return ordered
def most_common(self, n=None):
return self._counter.most_common(n)
@classmethod
def as_string(cls, occurrences, label_length):
output = "\n"
for key, count in occurrences:
key_str = str(key).ljust(label_length)[:label_length]
output += "%s %s\n" % (key_str, "*" * count)
return output
def __str__(self):
return Histogram.as_string(self.in_order(), label_length=self.label_length)
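A sketch of the histogram in use (import path assumed to be saucebrush.stats, where this file lives):
from saucebrush.stats import Histogram

hist = Histogram("grade")
list(hist.attach([{"grade": "A"}, {"grade": "B"}, {"grade": "A"}]))
print(hist.value())  # Counter({'A': 2, 'B': 1})
print(hist)          # one row of asterisks per value, padded to label_length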


@ -1,155 +0,0 @@
import os
from urllib.request import urlopen
"""
General utilities used within saucebrush that may be useful elsewhere.
"""
def get_django_model(dj_settings, app_label, model_name):
"""
Get a django model given a settings file, app label, and model name.
"""
from django.conf import settings
if not settings.configured:
settings.configure(
DATABASE_ENGINE=dj_settings.DATABASE_ENGINE,
DATABASE_NAME=dj_settings.DATABASE_NAME,
DATABASE_USER=dj_settings.DATABASE_USER,
DATABASE_PASSWORD=dj_settings.DATABASE_PASSWORD,
DATABASE_HOST=dj_settings.DATABASE_HOST,
INSTALLED_APPS=dj_settings.INSTALLED_APPS,
)
from django.db.models import get_model
return get_model(app_label, model_name)
def flatten(item, prefix="", separator="_", keys=None):
"""
Flatten nested dictionary into one with its keys concatenated together.
>>> flatten({'a':1, 'b':{'c':2}, 'd':[{'e':{'r':7}}, {'e':5}],
'f':{'g':{'h':6}}})
{'a': 1, 'b_c': 2, 'd': [{'e_r': 7}, {'e': 5}], 'f_g_h': 6}
"""
# update dictionaries recursively
if isinstance(item, dict):
# don't prepend a leading _
if prefix != "":
prefix += separator
retval = {}
for key, value in item.items():
if (not keys) or (key in keys):
retval.update(flatten(value, prefix + key, separator, keys))
else:
retval[prefix + key] = value
return retval
# elif isinstance(item, (tuple, list)):
# return {prefix: [flatten(i, prefix, separator, keys) for i in item]}
else:
return {prefix: item}
def str_or_list(obj):
if isinstance(obj, str):
return [obj]
else:
return obj
#
# utility classes
#
class Files:
"""Iterate over multiple files as a single file. Pass the paths of the
files as arguments to the class constructor:
for line in Files('/path/to/file/a', '/path/to/file/b'):
pass
"""
def __init__(self, *args):
self.paths = []
for arg in args:
self.add(arg)
self.file_open_callback = None
def add(self, path):
self.paths.append(path)
def __iter__(self):
return self.linereader()
def linereader(self):
for path in iter(self.paths):
if os.path.exists(path):
if self.file_open_callback:
self.file_open_callback(path)
f = open(path)
for line in f:
yield line
f.close()
class RemoteFile:
"""Stream data from a remote file.
:param url: URL to remote file
"""
def __init__(self, url):
self._url = url
def __iter__(self):
resp = urlopen(self._url)
for line in resp:
yield line.rstrip()
resp.close()
class ZippedFiles:
"""unpack a zipped collection of files on init.
Takes a string with file location or zipfile.ZipFile object
Best to wrap this in a Files() object, if the goal is to have a
linereader, as this only returns filelike objects.
if using a ZipFile object, make sure to set mode to 'a' or 'w' in order
to use the add() function.
"""
def __init__(self, zippedfile):
import zipfile
if type(zippedfile) == str:
self._zipfile = zipfile.ZipFile(zippedfile, "a")
else:
self._zipfile = zippedfile
self.paths = self._zipfile.namelist()
self.file_open_callback = None
def __iter__(self):
return self.filereader()
def add(self, path, dirname=None, arcname=None):
path_base = os.path.basename(path)
if dirname:
arcname = os.path.join(dirname, path_base)
if not arcname:
arcname = path_base
self._zipfile.write(path, arcname)
self.paths.append(path)
def filereader(self):
for path in iter(self.paths):
if self.file_open_callback:
self.file_open_callback(path)
yield self._zipfile.open(path)


@ -1,107 +0,0 @@
from contextlib import closing
from io import StringIO
import os
from saucebrush.emitters import (
DebugEmitter,
CSVEmitter,
CountEmitter,
SqliteEmitter,
SqlDumpEmitter,
)
def test_debug_emitter():
with closing(StringIO()) as output:
de = DebugEmitter(output)
list(de.attach([1, 2, 3]))
assert output.getvalue() == "1\n2\n3\n"
def test_count_emitter():
# values for test
values = [
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
18,
19,
20,
21,
22,
]
with closing(StringIO()) as output:
# test without the "of" parameter
ce = CountEmitter(every=10, outfile=output, format="%(count)s records\n")
list(ce.attach(values))
assert output.getvalue() == "10 records\n20 records\n"
ce.done()
assert output.getvalue() == "10 records\n20 records\n22 records\n"
with closing(StringIO()) as output:
# test with the "of" parameter
ce = CountEmitter(every=10, outfile=output, of=len(values))
list(ce.attach(values))
assert output.getvalue() == "10 of 22\n20 of 22\n"
ce.done()
assert output.getvalue() == "10 of 22\n20 of 22\n22 of 22\n"
def test_csv_emitter():
io = StringIO()  # in-memory buffer standing in for a real CSV file
with closing(io) as output:
ce = CSVEmitter(output, ("x", "y", "z"))
list(ce.attach([{"x": 1, "y": 2, "z": 3}, {"x": 5, "y": 5, "z": 5}]))
assert output.getvalue() == "x,y,z\r\n1,2,3\r\n5,5,5\r\n"
def test_sqlite_emitter():
import sqlite3
import tempfile
with closing(tempfile.NamedTemporaryFile(suffix=".db")) as f:
db_path = f.name
sle = SqliteEmitter(db_path, "testtable", fieldnames=("a", "b", "c"))
list(sle.attach([{"a": "1", "b": "2", "c": "3"}]))
sle.done()
with closing(sqlite3.connect(db_path)) as conn:
cur = conn.cursor()
cur.execute("""SELECT a, b, c FROM testtable""")
results = cur.fetchall()
os.unlink(db_path)
assert results == [("1", "2", "3")]
def test_sql_dump_emitter():
with closing(StringIO()) as bffr:
sde = SqlDumpEmitter(bffr, "testtable", ("a", "b"))
list(sde.attach([{"a": 1, "b": "2"}]))
sde.done()
assert bffr.getvalue() == "INSERT INTO `testtable` (`a`,`b`) VALUES (1,'2');\n"

View File

@ -1,355 +0,0 @@
import unittest
import types
from saucebrush.filters import (
Filter,
YieldFilter,
FieldFilter,
SubrecordFilter,
ConditionalPathFilter,
ConditionalFilter,
FieldModifier,
FieldKeeper,
FieldRemover,
FieldMerger,
FieldAdder,
FieldCopier,
FieldRenamer,
Unique,
)
class DummyRecipe:
rejected_record = None
rejected_msg = None
def reject_record(self, record, msg):
self.rejected_record = record
self.rejected_msg = msg
class Doubler(Filter):
def process_record(self, record):
return record * 2
class OddRemover(Filter):
def process_record(self, record):
if record % 2 == 0:
return record
else:
return None # explicitly return None
class ListFlattener(YieldFilter):
def process_record(self, record):
for item in record:
yield item
class FieldDoubler(FieldFilter):
def process_field(self, item):
return item * 2
class NonModifyingFieldDoubler(Filter):
def __init__(self, key):
self.key = key
def process_record(self, record):
record = dict(record)
record[self.key] *= 2
return record
class ConditionalOddRemover(ConditionalFilter):
def test_record(self, record):
# return True for even values
return record % 2 == 0
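# Hedged sketch of how these helpers compose: attach() wraps an iterable and
# yields processed records, so filters chain by nesting the generators.
#
#   doubled_evens = Doubler().attach(OddRemover().attach(range(5)))
#   assert list(doubled_evens) == [0, 4, 8]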
class FilterTestCase(unittest.TestCase):
def _simple_data(self):
return [
{"a": 1, "b": 2, "c": 3},
{"a": 5, "b": 5, "c": 5},
{"a": 1, "b": 10, "c": 100},
]
def assert_filter_result(self, filter_obj, expected_data):
result = filter_obj.attach(self._simple_data())
self.assertEqual(list(result), expected_data)
def test_reject_record(self):
recipe = DummyRecipe()
f = Doubler()
result = f.attach([1, 2, 3], recipe=recipe)
# next has to be called for attach to take effect
next(result)
f.reject_record("bad", "this one was bad")
# ensure that the rejection propagated to the recipe
self.assertEqual("bad", recipe.rejected_record)
self.assertEqual("this one was bad", recipe.rejected_msg)
def test_simple_filter(self):
df = Doubler()
result = df.attach([1, 2, 3])
# ensure we got a generator that yields 2,4,6
self.assertEqual(type(result), types.GeneratorType)
self.assertEqual(list(result), [2, 4, 6])
def test_simple_filter_return_none(self):
cf = OddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEqual(list(result), [0, 2, 4, 6, 8])
def test_simple_yield_filter(self):
lf = ListFlattener()
result = lf.attach([[1], [2, 3], [4, 5, 6]])
# ensure we got a generator that yields 1,2,3,4,5,6
self.assertEqual(type(result), types.GeneratorType)
self.assertEqual(list(result), [1, 2, 3, 4, 5, 6])
def test_simple_field_filter(self):
ff = FieldDoubler(["a", "c"])
# check against expected data
expected_data = [
{"a": 2, "b": 2, "c": 6},
{"a": 10, "b": 5, "c": 10},
{"a": 2, "b": 10, "c": 200},
]
self.assert_filter_result(ff, expected_data)
def test_conditional_filter(self):
cf = ConditionalOddRemover()
result = cf.attach(range(10))
# ensure only even numbers remain
self.assertEqual(list(result), [0, 2, 4, 6, 8])
# Tests for Subrecord
def test_subrecord_filter_list(self):
data = [
{"a": [{"b": 2}, {"b": 4}]},
{"a": [{"b": 5}]},
{"a": [{"b": 8}, {"b": 2}, {"b": 1}]},
]
expected = [
{"a": [{"b": 4}, {"b": 8}]},
{"a": [{"b": 10}]},
{"a": [{"b": 16}, {"b": 4}, {"b": 2}]},
]
sf = SubrecordFilter("a", NonModifyingFieldDoubler("b"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_deep(self):
data = [
{"a": {"d": [{"b": 2}, {"b": 4}]}},
{"a": {"d": [{"b": 5}]}},
{"a": {"d": [{"b": 8}, {"b": 2}, {"b": 1}]}},
]
expected = [
{"a": {"d": [{"b": 4}, {"b": 8}]}},
{"a": {"d": [{"b": 10}]}},
{"a": {"d": [{"b": 16}, {"b": 4}, {"b": 2}]}},
]
sf = SubrecordFilter("a.d", NonModifyingFieldDoubler("b"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_nonlist(self):
data = [
{"a": {"b": {"c": 1}}},
{"a": {"b": {"c": 2}}},
{"a": {"b": {"c": 3}}},
]
expected = [
{"a": {"b": {"c": 2}}},
{"a": {"b": {"c": 4}}},
{"a": {"b": {"c": 6}}},
]
sf = SubrecordFilter("a.b", NonModifyingFieldDoubler("c"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_subrecord_filter_list_in_path(self):
data = [
{"a": [{"b": {"c": 5}}, {"b": {"c": 6}}]},
{"a": [{"b": {"c": 1}}, {"b": {"c": 2}}, {"b": {"c": 3}}]},
{"a": [{"b": {"c": 2}}]},
]
expected = [
{"a": [{"b": {"c": 10}}, {"b": {"c": 12}}]},
{"a": [{"b": {"c": 2}}, {"b": {"c": 4}}, {"b": {"c": 6}}]},
{"a": [{"b": {"c": 4}}]},
]
sf = SubrecordFilter("a.b", NonModifyingFieldDoubler("c"))
result = sf.attach(data)
self.assertEqual(list(result), expected)
def test_conditional_path(self):
predicate = lambda r: r["a"] == 1 # noqa
# double b if a == 1, otherwise double c
cpf = ConditionalPathFilter(predicate, FieldDoubler("b"), FieldDoubler("c"))
expected_data = [
{"a": 1, "b": 4, "c": 3},
{"a": 5, "b": 5, "c": 10},
{"a": 1, "b": 20, "c": 100},
]
self.assert_filter_result(cpf, expected_data)
# Tests for Generic Filters
def test_field_modifier(self):
# another version of FieldDoubler
fm = FieldModifier(["a", "c"], lambda x: x * 2)
# check against expected data
expected_data = [
{"a": 2, "b": 2, "c": 6},
{"a": 10, "b": 5, "c": 10},
{"a": 2, "b": 10, "c": 200},
]
self.assert_filter_result(fm, expected_data)
def test_field_keeper(self):
fk = FieldKeeper(["c"])
# check against expected results
expected_data = [{"c": 3}, {"c": 5}, {"c": 100}]
self.assert_filter_result(fk, expected_data)
def test_field_remover(self):
fr = FieldRemover(["a", "b"])
# check against expected results
expected_data = [{"c": 3}, {"c": 5}, {"c": 100}]
self.assert_filter_result(fr, expected_data)
def test_field_merger(self):
fm = FieldMerger({"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z)
# check against expected results
expected_data = [{"sum": 6}, {"sum": 15}, {"sum": 111}]
self.assert_filter_result(fm, expected_data)
def test_field_merger_keep_fields(self):
fm = FieldMerger(
{"sum": ("a", "b", "c")}, lambda x, y, z: x + y + z, keep_fields=True
)
# check against expected results
expected_data = [
{"a": 1, "b": 2, "c": 3, "sum": 6},
{"a": 5, "b": 5, "c": 5, "sum": 15},
{"a": 1, "b": 10, "c": 100, "sum": 111},
]
self.assert_filter_result(fm, expected_data)
def test_field_adder_scalar(self):
fa = FieldAdder("x", 7)
expected_data = [
{"a": 1, "b": 2, "c": 3, "x": 7},
{"a": 5, "b": 5, "c": 5, "x": 7},
{"a": 1, "b": 10, "c": 100, "x": 7},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_callable(self):
fa = FieldAdder("x", lambda: 7)
expected_data = [
{"a": 1, "b": 2, "c": 3, "x": 7},
{"a": 5, "b": 5, "c": 5, "x": 7},
{"a": 1, "b": 10, "c": 100, "x": 7},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_iterable(self):
fa = FieldAdder("x", [1, 2, 3])
expected_data = [
{"a": 1, "b": 2, "c": 3, "x": 1},
{"a": 5, "b": 5, "c": 5, "x": 2},
{"a": 1, "b": 10, "c": 100, "x": 3},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_replace(self):
fa = FieldAdder("b", lambda: 7)
expected_data = [
{"a": 1, "b": 7, "c": 3},
{"a": 5, "b": 7, "c": 5},
{"a": 1, "b": 7, "c": 100},
]
self.assert_filter_result(fa, expected_data)
def test_field_adder_no_replace(self):
fa = FieldAdder("b", lambda: 7, replace=False)
expected_data = [
{"a": 1, "b": 2, "c": 3},
{"a": 5, "b": 5, "c": 5},
{"a": 1, "b": 10, "c": 100},
]
self.assert_filter_result(fa, expected_data)
def test_field_copier(self):
fc = FieldCopier({"a2": "a", "b2": "b"})
expected_data = [
{"a": 1, "b": 2, "c": 3, "a2": 1, "b2": 2},
{"a": 5, "b": 5, "c": 5, "a2": 5, "b2": 5},
{"a": 1, "b": 10, "c": 100, "a2": 1, "b2": 10},
]
self.assert_filter_result(fc, expected_data)
def test_field_renamer(self):
fr = FieldRenamer({"x": "a", "y": "b"})
expected_data = [
{"x": 1, "y": 2, "c": 3},
{"x": 5, "y": 5, "c": 5},
{"x": 1, "y": 10, "c": 100},
]
self.assert_filter_result(fr, expected_data)
# TODO: splitter & flattener tests?
def test_unique_filter(self):
u = Unique()
in_data = [{"a": 77}, {"a": 33}, {"a": 77}]
expected_data = [{"a": 77}, {"a": 33}]
result = u.attach(in_data)
self.assertEqual(list(result), expected_data)
# TODO: unicode & string filter tests
if __name__ == "__main__":
unittest.main()

View File

@ -1,49 +0,0 @@
import pytest
from saucebrush import Recipe, run_recipe, SaucebrushError, OvercookedError
from saucebrush.filters import Filter
class Raiser(Filter):
def process_record(self, record):
raise Exception("bad record")
class Saver(Filter):
def __init__(self):
self.saved = []
def process_record(self, record):
self.saved.append(record)
return record
def test_error_stream():
saver = Saver()
recipe = Recipe(Raiser(), error_stream=saver)
recipe.run([{"a": 1}, {"b": 2}])
recipe.done()
assert saver.saved[0]["record"] == {"a": 1}
assert saver.saved[1]["record"] == {"b": 2}
# Must pass either a Recipe, a Filter or an iterable of Filters
# as the error_stream argument
assert pytest.raises(SaucebrushError, Recipe, error_stream=5)
def test_run_recipe():
saver = Saver()
run_recipe([1, 2], saver)
assert saver.saved == [1, 2]
def test_done():
saver = Saver()
recipe = Recipe(saver)
recipe.run([1])
recipe.done()
assert pytest.raises(OvercookedError, recipe.run, [2])
assert pytest.raises(OvercookedError, recipe.done)
assert saver.saved == [1]
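# Hedged sketch of the error_stream contract exercised above -- a Recipe, a
# single Filter, or an iterable of Filters should all be accepted:
#
#   Recipe(Raiser(), error_stream=Saver())
#   Recipe(Raiser(), error_stream=[Saver(), Saver()])
#   Recipe(Raiser(), error_stream=Recipe(Saver()))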

View File

@ -1,90 +0,0 @@
from io import StringIO
from saucebrush.sources import (
CSVSource,
FixedWidthFileSource,
HtmlTableSource,
JSONSource,
)
def _get_csv():
data = """a,b,c
1,2,3
5,5,5
1,10,100"""
return StringIO(data)
def test_csv_source_basic():
source = CSVSource(_get_csv())
expected_data = [
{"a": "1", "b": "2", "c": "3"},
{"a": "5", "b": "5", "c": "5"},
{"a": "1", "b": "10", "c": "100"},
]
assert list(source) == expected_data
def test_csv_source_fieldnames():
source = CSVSource(_get_csv(), ["x", "y", "z"])
expected_data = [
{"x": "a", "y": "b", "z": "c"},
{"x": "1", "y": "2", "z": "3"},
{"x": "5", "y": "5", "z": "5"},
{"x": "1", "y": "10", "z": "100"},
]
assert list(source) == expected_data
def test_csv_source_skiprows():
source = CSVSource(_get_csv(), skiprows=1)
expected_data = [
{"a": "5", "b": "5", "c": "5"},
{"a": "1", "b": "10", "c": "100"},
]
assert list(source) == expected_data
def test_fixed_width_source():
data = StringIO("JamesNovember 3 1986\nTim September151999")
fields = (("name", 5), ("month", 9), ("day", 2), ("year", 4))
source = FixedWidthFileSource(data, fields)
expected_data = [
{"name": "James", "month": "November", "day": "3", "year": "1986"},
{"name": "Tim", "month": "September", "day": "15", "year": "1999"},
]
assert list(source) == expected_data
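# Hedged reading of the layout implied by the widths above: name occupies
# columns 0-4, month 5-13, day 14-15, and year 16-19, with each slice
# stripped of padding before it lands in the record.
#
#   >>> "JamesNovember 3 1986"[5:14]
#   'November '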
def test_json_source():
content = StringIO("""[{"a": 1, "b": "2", "c": 3}]""")
js = JSONSource(content)
assert list(js) == [{"a": 1, "b": "2", "c": 3}]
def test_html_table_source():
content = StringIO(
"""
<html>
<table id="thetable">
<tr>
<th>a</th>
<th>b</th>
<th>c</th>
</tr>
<tr>
<td>1</td>
<td>2</td>
<td>3</td>
</tr>
</table>
</html>
"""
)
hts = HtmlTableSource(content, "thetable")
assert list(hts) == [{"a": "1", "b": "2", "c": "3"}]

View File

@ -1,55 +0,0 @@
from saucebrush.stats import Sum, Average, Median, MinMax, StandardDeviation, Histogram
def _simple_data():
return [
{"a": 1, "b": 2, "c": 3},
{"a": 5, "b": 5, "c": 5},
{"a": 1, "b": 10, "c": 100},
]
def test_sum():
fltr = Sum("b")
list(fltr.attach(_simple_data()))
assert fltr.value() == 17
def test_average():
fltr = Average("c")
list(fltr.attach(_simple_data()))
assert fltr.value() == 36.0
def test_median():
# odd number of values
fltr = Median("a")
list(fltr.attach(_simple_data()))
assert fltr.value() == 1
# even number of values
fltr = Median("a")
list(fltr.attach(_simple_data()[:2]))
assert fltr.value() == 3
def test_minmax():
fltr = MinMax("b")
list(fltr.attach(_simple_data()))
assert fltr.value() == (2, 10)
def test_standard_deviation():
fltr = StandardDeviation("c")
list(fltr.attach(_simple_data()))
assert fltr.average() == 36.0
assert fltr.median() == 5
assert fltr.value() == (55.4346462061408, 3073.0)
assert fltr.value(True) == (45.2621990922521, 2048.6666666666665)
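# Hedged arithmetic behind the assertions above: c is (3, 5, 100) with mean
# 36, so the squared deviations sum to 33**2 + 31**2 + 64**2 = 6146.
# 6146 / 2 = 3073.0 matches the default value(), and 6146 / 3 = 2048.67
# matches value(True), so the flag appears to switch the divisor from
# n - 1 to n.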
def test_histogram():
fltr = Histogram("a")
fltr.label_length = 1
list(fltr.attach(_simple_data()))
assert str(fltr) == "\n1 **\n5 *\n"