databeakers

This commit is contained in:
James Turk 2023-08-04 20:49:57 -05:00
parent c21954ebba
commit b357a6d1d5
10 changed files with 127 additions and 789 deletions

View File

@ -1,61 +0,0 @@
from beakers.recipe import Recipe
import pydantic
class Word(pydantic.BaseModel):
    # Raw input word; word_classifier lower-cases it before classifying.
    word: str
class ClassifiedWord(pydantic.BaseModel):
    # Lower-cased form of the input word (set by word_classifier).
    normalized_word: str
    # True when the lower-cased word is in word_classifier's fruit list.
    is_fruit: bool
class Sentence(pydantic.BaseModel):
    # The sentence as a list of whitespace-separated tokens.
    sentence: list[str]
def word_classifier(item) -> ClassifiedWord:
    """Lower-case ``item.word`` and flag whether it names a known fruit."""
    known_fruits = (
        "apple",
        "banana",
        "fig",
        "grape",
        "lemon",
        "mango",
        "orange",
        "pear",
        "raspberry",
    )
    lowered = item.word.lower()
    return ClassifiedWord(
        normalized_word=lowered,
        is_fruit=lowered in known_fruits,
    )
# Example pipeline: word -> classified_word -> (fruits only) -> sentence.
recipe = Recipe("fruits-example")

# Beakers that hold each stage's data.
recipe.add_beaker("word", Word)
recipe.add_beaker("classified_word", ClassifiedWord)
recipe.add_beaker("sentence", Sentence)

# Classify every word, then let only the fruits continue.
recipe.add_transform("word", "classified_word", word_classifier)
recipe.add_conditional("classified_word", lambda cw: cw.is_fruit, "fruits")

# Turn each fruit into a tokenized sentence.
recipe.add_transform(
    "fruits",
    "sentence",
    lambda x: Sentence(sentence=f"I love a fresh {x.normalized_word}".split()),
)

# Seed data: a mix of fruits and non-fruits in assorted casing.
recipe.add_seed(
    "word",
    [Word(word=text) for text in ("apple", "bAnAnA", "hammer", "orange", "EGG")],
)

245
poetry.lock generated
View File

@ -1,10 +1,9 @@
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]]
name = "annotated-types"
version = "0.5.0"
description = "Reusable constraint types to use with typing.Annotated"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -16,7 +15,6 @@ files = [
name = "anyio"
version = "3.7.1"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -37,7 +35,6 @@ trio = ["trio (<0.22)"]
name = "certifi"
version = "2023.5.7"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -49,7 +46,6 @@ files = [
name = "charset-normalizer"
version = "3.2.0"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.7.0"
files = [
@ -130,11 +126,24 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
]
[[package]]
name = "click"
version = "8.1.6"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
files = [
{file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"},
{file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
@ -142,11 +151,32 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "databeakers"
version = "0.1.0"
description = ""
optional = false
python-versions = "^3.10"
files = []
develop = true
[package.dependencies]
httpx = "^0.24.0"
networkx = "^3.1"
pydantic = "^2.0.2"
rich = "^13.4.2"
scrapelib = "^2.1.0"
structlog = "^23.1.0"
typer = "^0.9.0"
[package.source]
type = "directory"
url = "../beakers"
[[package]]
name = "h11"
version = "0.14.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -158,7 +188,6 @@ files = [
name = "httpcore"
version = "0.17.3"
description = "A minimal low-level HTTP client."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -170,17 +199,16 @@ files = [
anyio = ">=3.0,<5.0"
certifi = "*"
h11 = ">=0.13,<0.15"
sniffio = ">=1.0.0,<2.0.0"
sniffio = "==1.*"
[package.extras]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (>=1.0.0,<2.0.0)"]
socks = ["socksio (==1.*)"]
[[package]]
name = "httpx"
version = "0.24.1"
description = "The next generation HTTP client."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -196,15 +224,14 @@ sniffio = "*"
[package.extras]
brotli = ["brotli", "brotlicffi"]
cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"]
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
http2 = ["h2 (>=3,<5)"]
socks = ["socksio (>=1.0.0,<2.0.0)"]
socks = ["socksio (==1.*)"]
[[package]]
name = "idna"
version = "3.4"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -213,80 +240,44 @@ files = [
]
[[package]]
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
category = "dev"
name = "markdown-it-py"
version = "3.0.0"
description = "Python port of markdown-it. Markdown parsing, done right!"
optional = false
python-versions = ">=3.7"
python-versions = ">=3.8"
files = [
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
]
[[package]]
name = "mypy"
version = "1.4.1"
description = "Optional static typing for Python"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
{file = "mypy-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:566e72b0cd6598503e48ea610e0052d1b8168e60a46e0bfd34b3acf2d57f96a8"},
{file = "mypy-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca637024ca67ab24a7fd6f65d280572c3794665eaf5edcc7e90a866544076878"},
{file = "mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dde1d180cd84f0624c5dcaaa89c89775550a675aff96b5848de78fb11adabcd"},
{file = "mypy-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c4d8e89aa7de683e2056a581ce63c46a0c41e31bd2b6d34144e2c80f5ea53dc"},
{file = "mypy-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:bfdca17c36ae01a21274a3c387a63aa1aafe72bff976522886869ef131b937f1"},
{file = "mypy-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7549fbf655e5825d787bbc9ecf6028731973f78088fbca3a1f4145c39ef09462"},
{file = "mypy-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98324ec3ecf12296e6422939e54763faedbfcc502ea4a4c38502082711867258"},
{file = "mypy-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141dedfdbfe8a04142881ff30ce6e6653c9685b354876b12e4fe6c78598b45e2"},
{file = "mypy-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8207b7105829eca6f3d774f64a904190bb2231de91b8b186d21ffd98005f14a7"},
{file = "mypy-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:16f0db5b641ba159eff72cff08edc3875f2b62b2fa2bc24f68c1e7a4e8232d01"},
{file = "mypy-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:470c969bb3f9a9efcedbadcd19a74ffb34a25f8e6b0e02dae7c0e71f8372f97b"},
{file = "mypy-1.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5952d2d18b79f7dc25e62e014fe5a23eb1a3d2bc66318df8988a01b1a037c5b"},
{file = "mypy-1.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:190b6bab0302cec4e9e6767d3eb66085aef2a1cc98fe04936d8a42ed2ba77bb7"},
{file = "mypy-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9d40652cc4fe33871ad3338581dca3297ff5f2213d0df345bcfbde5162abf0c9"},
{file = "mypy-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01fd2e9f85622d981fd9063bfaef1aed6e336eaacca00892cd2d82801ab7c042"},
{file = "mypy-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2460a58faeea905aeb1b9b36f5065f2dc9a9c6e4c992a6499a2360c6c74ceca3"},
{file = "mypy-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2746d69a8196698146a3dbe29104f9eb6a2a4d8a27878d92169a6c0b74435b6"},
{file = "mypy-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ae704dcfaa180ff7c4cfbad23e74321a2b774f92ca77fd94ce1049175a21c97f"},
{file = "mypy-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:43d24f6437925ce50139a310a64b2ab048cb2d3694c84c71c3f2a1626d8101dc"},
{file = "mypy-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c482e1246726616088532b5e964e39765b6d1520791348e6c9dc3af25b233828"},
{file = "mypy-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43b592511672017f5b1a483527fd2684347fdffc041c9ef53428c8dc530f79a3"},
{file = "mypy-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34a9239d5b3502c17f07fd7c0b2ae6b7dd7d7f6af35fbb5072c6208e76295816"},
{file = "mypy-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5703097c4936bbb9e9bce41478c8d08edd2865e177dc4c52be759f81ee4dd26c"},
{file = "mypy-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e02d700ec8d9b1859790c0475df4e4092c7bf3272a4fd2c9f33d87fac4427b8f"},
{file = "mypy-1.4.1-py3-none-any.whl", hash = "sha256:45d32cec14e7b97af848bddd97d85ea4f0db4d5a149ed9676caa4eb2f7402bb4"},
{file = "mypy-1.4.1.tar.gz", hash = "sha256:9bbcd9ab8ea1f2e1c8031c21445b511442cc45c89951e49bbf852cbb70755b1b"},
{file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
{file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
]
[package.dependencies]
mypy-extensions = ">=1.0.0"
typing-extensions = ">=4.1.0"
mdurl = ">=0.1,<1.0"
[package.extras]
dmypy = ["psutil (>=4.0)"]
install-types = ["pip"]
python2 = ["typed-ast (>=1.4.0,<2)"]
reports = ["lxml"]
benchmarking = ["psutil", "pytest", "pytest-benchmark"]
code-style = ["pre-commit (>=3.0,<4.0)"]
compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
linkify = ["linkify-it-py (>=1,<3)"]
plugins = ["mdit-py-plugins"]
profiling = ["gprof2dot"]
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
category = "dev"
name = "mdurl"
version = "0.1.2"
description = "Markdown URL utilities"
optional = false
python-versions = ">=3.5"
python-versions = ">=3.7"
files = [
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
]
[[package]]
name = "networkx"
version = "3.1"
description = "Python package for creating and manipulating graphs and networks"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -301,39 +292,10 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
[[package]]
name = "packaging"
version = "23.1"
description = "Core utilities for Python packages"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
{file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"},
{file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
]
[[package]]
name = "pluggy"
version = "1.2.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
{file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"},
{file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"},
]
[package.extras]
dev = ["pre-commit", "tox"]
testing = ["pytest", "pytest-benchmark"]
[[package]]
name = "pydantic"
version = "2.0.2"
description = "Data validation using Python type hints"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -353,7 +315,6 @@ email = ["email-validator (>=2.0.0)"]
name = "pydantic-core"
version = "2.1.2"
description = ""
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -464,31 +425,23 @@ files = [
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pytest"
version = "7.4.0"
description = "pytest: simple powerful testing with Python"
category = "dev"
name = "pygments"
version = "2.15.1"
description = "Pygments is a syntax highlighting package written in Python."
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"},
{file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"},
{file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"},
{file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"},
]
[package.dependencies]
colorama = {version = "*", markers = "sys_platform == \"win32\""}
iniconfig = "*"
packaging = "*"
pluggy = ">=0.12,<2.0"
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
plugins = ["importlib-metadata"]
[[package]]
name = "requests"
version = "2.31.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -506,11 +459,28 @@ urllib3 = ">=1.21.1,<3"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "rich"
version = "13.5.2"
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
optional = false
python-versions = ">=3.7.0"
files = [
{file = "rich-13.5.2-py3-none-any.whl", hash = "sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808"},
{file = "rich-13.5.2.tar.gz", hash = "sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39"},
]
[package.dependencies]
markdown-it-py = ">=2.2.0"
pygments = ">=2.13.0,<3.0.0"
[package.extras]
jupyter = ["ipywidgets (>=7.5.1,<9)"]
[[package]]
name = "scrapelib"
version = "2.2.0"
description = ""
category = "main"
optional = false
python-versions = ">=3.7,<4.0"
files = [
@ -526,7 +496,6 @@ urllib3 = ">=1.26,<2.0"
name = "sniffio"
version = "1.3.0"
description = "Sniff out which async library your code is running under"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -534,11 +503,48 @@ files = [
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
]
[[package]]
name = "structlog"
version = "23.1.0"
description = "Structured Logging for Python"
optional = false
python-versions = ">=3.7"
files = [
{file = "structlog-23.1.0-py3-none-any.whl", hash = "sha256:79b9e68e48b54e373441e130fa447944e6f87a05b35de23138e475c05d0f7e0e"},
{file = "structlog-23.1.0.tar.gz", hash = "sha256:270d681dd7d163c11ba500bc914b2472d2b50a8ef00faa999ded5ff83a2f906b"},
]
[package.extras]
dev = ["structlog[docs,tests,typing]"]
docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-mermaid", "twisted"]
tests = ["coverage[toml]", "freezegun (>=0.2.8)", "pretend", "pytest (>=6.0)", "pytest-asyncio (>=0.17)", "simplejson"]
typing = ["mypy", "rich", "twisted"]
[[package]]
name = "typer"
version = "0.9.0"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
optional = false
python-versions = ">=3.6"
files = [
{file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
{file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
]
[package.dependencies]
click = ">=7.1.1,<9.0.0"
typing-extensions = ">=3.7.4.3"
[package.extras]
all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
[[package]]
name = "typing-extensions"
version = "4.7.1"
description = "Backported and Experimental Type Hints for Python 3.7+"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -550,7 +556,6 @@ files = [
name = "urllib3"
version = "1.26.16"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
files = [
@ -566,4 +571,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "136157484750ab7b80e7896a5dc2905b4f9c7ed20b47dadba4e6ae45f0f6c289"
content-hash = "a121f92ed7a60a9138f1e19bbf88220cd0a5b25c0287f031e36cec0a79188530"

View File

@ -1,27 +1,15 @@
[tool.poetry]
name = "beakers"
name = "foiaghost"
version = "0.1.0"
description = ""
authors = ["James Turk <dev@jamesturk.net>"]
readme = "README.md"
[tool.poetry.scripts]
bkr = 'beakers.cli:app'
[tool.poetry.dependencies]
python = "^3.11"
#scrapeghost = {path = "../scrapeghost", develop = true}
scrapelib = "^2.1.0"
httpx = "^0.24.0"
networkx = "^3.1"
pydantic = "^2.0.2"
databeakers = {path = "../beakers", develop = true}
[tool.poetry.group.dev.dependencies]
pytest = "^7.4.0"
mypy = "^1.4.1"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

View File

@ -1 +0,0 @@
from .recipe import Recipe

View File

@ -1,99 +0,0 @@
import abc
import json
import sqlite3
import uuid
from pydantic import BaseModel
from typing import Iterable, Type, TYPE_CHECKING
if TYPE_CHECKING:
from .recipe import Recipe
# Alias for "a pydantic model class" (the class itself, not an instance).
PydanticModel = Type[BaseModel]
class Beaker(abc.ABC):
    """Abstract named container of ``(id, item)`` pairs owned by a recipe.

    Subclasses decide where the items are actually stored.
    """

    def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
        """
        Args:
            name: Name of the beaker.
            model: Model class for the beaker's items.
            recipe: The recipe this beaker belongs to.
        """
        self.name = name
        self.model = model
        self.recipe = recipe

    def __repr__(self) -> str:
        # ``model`` can be None (Recipe.add_conditional creates beakers that
        # way), so fall back instead of raising AttributeError on __name__.
        model_name = getattr(self.model, "__name__", None)
        return f"Beaker({self.name}, {model_name})"

    @abc.abstractmethod
    def items(self) -> Iterable[tuple[str, BaseModel]]:
        """Yield ``(id, item)`` pairs for everything in the beaker."""

    @abc.abstractmethod
    def __len__(self) -> int:
        """Return the number of items in the beaker."""

    @abc.abstractmethod
    def add_item(self, item: BaseModel, id: str | None = None) -> None:
        """Store ``item`` under ``id``."""

    @abc.abstractmethod
    def reset(self) -> None:
        """Remove every item from the beaker."""

    def add_items(self, items: Iterable[BaseModel]) -> None:
        """Add each item via ``add_item`` without passing an explicit id."""
        for item in items:
            self.add_item(item)

    def id_set(self) -> set[str]:
        """Return the set of ids currently present in the beaker."""
        return {item_id for item_id, _ in self.items()}
class TempBeaker(Beaker):
    """Beaker that keeps its ``(id, item)`` pairs in an in-memory list."""

    def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
        super().__init__(name, model, recipe)
        self._items: list[tuple[str, BaseModel]] = []

    def __len__(self) -> int:
        return len(self._items)

    def add_item(self, item: BaseModel, id: str | None = None) -> None:
        """Append ``item``; a uuid1 string is generated when ``id`` is None."""
        item_id = str(uuid.uuid1()) if id is None else id
        self._items.append((item_id, item))

    def items(self) -> Iterable[tuple[str, BaseModel]]:
        """Yield the stored pairs in insertion order."""
        for pair in self._items:
            yield pair

    def reset(self) -> None:
        """Drop all stored pairs by rebinding to a fresh list."""
        self._items = []
class SqliteBeaker(Beaker):
    """Beaker whose items are stored as JSON rows in the recipe's SQLite database.

    Each beaker uses a table named after itself with columns ``uuid`` and
    ``data``.
    """

    def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
        super().__init__(name, model, recipe)
        # Table names cannot be bound as SQL parameters, so quote the
        # identifier once (doubling any embedded quotes) and reuse it in
        # every statement below.
        self._table = '"' + name.replace('"', '""') + '"'
        self.cursor = self.recipe.db.cursor()
        # Row objects allow access by column name in items().
        self.cursor.row_factory = sqlite3.Row  # type: ignore
        # create table if it doesn't exist
        self.cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {self._table} (uuid TEXT PRIMARY KEY, data JSON)"
        )

    def items(self) -> Iterable[tuple[str, BaseModel]]:
        """Yield ``(uuid, model instance)`` for every stored row."""
        self.cursor.execute(f"SELECT uuid, data FROM {self._table}")
        # Fetch everything up front: the cursor is shared with the other
        # methods, which may run while the caller is still iterating.
        rows = self.cursor.fetchall()
        for row in rows:
            yield row["uuid"], self.model(**json.loads(row["data"]))

    def __len__(self) -> int:
        self.cursor.execute(f"SELECT COUNT(*) FROM {self._table}")
        return self.cursor.fetchone()[0]

    def add_item(self, item: BaseModel, id: str | None = None) -> None:
        """Insert ``item`` as JSON and commit.

        A uuid1 string is generated when ``id`` is None.
        """
        if id is None:
            id = str(uuid.uuid1())
        self.cursor.execute(
            f"INSERT INTO {self._table} (uuid, data) VALUES (?, ?)",
            (id, item.model_dump_json()),
        )
        self.recipe.db.commit()

    def reset(self) -> None:
        """Delete every row of the beaker's table and commit."""
        self.cursor.execute(f"DELETE FROM {self._table}")
        self.recipe.db.commit()

View File

@ -1,77 +0,0 @@
import importlib
from types import SimpleNamespace
import typer
import sys
from pprint import pprint
from typing import List, Optional
from typing_extensions import Annotated
from beakers.beakers import SqliteBeaker
# Typer application object; the commands below register themselves on it.
app = typer.Typer()
def _load_recipe(dotted_path: str) -> SimpleNamespace:
sys.path.append(".")
path, name = dotted_path.rsplit(".", 1)
mod = importlib.import_module(path)
return getattr(mod, name)
@app.callback()
def main(
    ctx: typer.Context,
    recipe: str = typer.Option(None, envvar="BEAKER_RECIPE"),
) -> None:
    # Runs before every command: resolve the recipe (option or BEAKER_RECIPE
    # environment variable) and stash it on the context for the commands.
    if recipe:
        ctx.obj = _load_recipe(recipe)
        return
    # No recipe given: report and exit with status 1.
    typer.secho(
        "Missing recipe; pass --recipe or set env[BEAKER_RECIPE]",
        fg=typer.colors.RED,
    )
    raise typer.Exit(1)
@app.command()
def reset(ctx: typer.Context) -> None:
    # Empty every SQLite-backed beaker of the loaded recipe, reporting how
    # many items each one held; other beaker types are left alone.
    sqlite_beakers = (
        candidate
        for candidate in ctx.obj.beakers.values()
        if isinstance(candidate, SqliteBeaker)
    )
    for beaker in sqlite_beakers:
        count = len(beaker)
        if not count:
            typer.secho(f"{beaker.name} empty", fg=typer.colors.GREEN)
            continue
        beaker.reset()
        typer.secho(f"{beaker.name} reset ({count})", fg=typer.colors.RED)
@app.command()
def show(ctx: typer.Context) -> None:
    # Delegate to the loaded recipe's own show() method.
    ctx.obj.show()
@app.command()
def graph(ctx: typer.Context) -> None:
    # Pretty-print the structure returned by the recipe's graph_data().
    pprint(ctx.obj.graph_data())
@app.command()
def run(
    ctx: typer.Context,
    input: Annotated[Optional[List[str]], typer.Option(...)] = None,
    start: Optional[str] = typer.Option(None),
    end: Optional[str] = typer.Option(None),
) -> None:
    # Seed the recipe, load any BEAKER=FILENAME CSV inputs, then run the
    # recipe once between the optional start and end beakers.
    if ctx.obj.seeds:
        typer.secho("Seeding beakers", fg=typer.colors.GREEN)
        ctx.obj.process_seeds()
    # A beaker is truthy when it holds at least one item (via __len__).
    has_data = any(ctx.obj.beakers.values())
    if not input and not has_data:
        typer.secho("No data; pass --input to seed beaker(s)", fg=typer.colors.RED)
        raise typer.Exit(1)
    for input_str in input or []:
        # Split on the first "=" only, so filenames may contain "=".
        beaker, sep, filename = input_str.partition("=")
        if not sep:
            typer.secho(
                f"Invalid --input {input_str!r}; expected BEAKER=FILENAME",
                fg=typer.colors.RED,
            )
            raise typer.Exit(1)
        ctx.obj.csv_to_beaker(filename, beaker)
    ctx.obj.run_once(start, end)
# Allow running this module directly as a script.
if __name__ == "__main__":
    app()

View File

@ -1,41 +0,0 @@
import httpx
from pydantic import BaseModel, Field
import datetime
class HttpResponse(BaseModel):
    """
    Beaker data type that represents an HTTP response.
    """

    # URL that was requested.
    url: str
    # Status code of the response.
    status_code: int
    # Response body as text.
    response_body: str
    # Defaults to datetime.datetime.now() at model creation (naive timestamp).
    retrieved_at: datetime.datetime = Field(default_factory=datetime.datetime.now)
class HttpRequest:
    """
    Async transform that reads a URL from an item and returns the HTTP
    response obtained by GETting it.
    """

    def __init__(self, beaker: str, field: str):
        """
        Args:
            beaker: The name of the beaker that contains the URL.
            field: The name of the field in the beaker that contains the URL.
        """
        self.beaker = beaker
        self.field = field

    async def __call__(self, item: BaseModel) -> HttpResponse:
        """GET the URL found in ``item``'s configured field."""
        target = getattr(item, self.field)
        # A new client is opened (and closed) for every call.
        async with httpx.AsyncClient() as client:
            reply = await client.get(target)
            status = reply.status_code
            body = reply.text
            return HttpResponse(url=target, status_code=status, response_body=body)

View File

@ -1,319 +0,0 @@
import csv
import json
import typer
import inspect
import sqlite3
import hashlib
import asyncio
import networkx # type: ignore
from collections import defaultdict, Counter
from typing import Iterable, Callable, Type
from pydantic import BaseModel, ConfigDict
from structlog import get_logger
from .beakers import Beaker, SqliteBeaker, TempBeaker
# Module-level structlog logger used throughout this file.
log = get_logger()
def get_sha512(filename: str) -> str:
    """Return the hex SHA-512 digest of the file at ``filename``.

    The file is hashed in fixed-size chunks so large files are never loaded
    into memory in one piece.
    """
    digest = hashlib.sha512()
    with open(filename, "rb") as file:
        while chunk := file.read(1 << 20):
            digest.update(chunk)
    return digest.hexdigest()
class Transform(BaseModel):
    # frozen=True asks pydantic to reject attribute assignment after creation.
    model_config = ConfigDict(frozen=True)
    # Display name of the transform (used in logs and Recipe.show).
    name: str
    # Callable applied to each item moving along the edge.
    transform_func: Callable
    # Maps a tuple of exception classes to the name of the beaker that
    # receives items whose transform raised one of them.
    error_map: dict[tuple, str]
class ErrorType(BaseModel):
    # The item whose transform raised.
    item: BaseModel
    # str() of the exception that was raised.
    exception: str
    # str() of the exception's type.
    exc_type: str
def if_cond_true(data_cond_tup: tuple[dict, bool]) -> dict | None:
return data_cond_tup[0] if data_cond_tup[1] else None
def if_cond_false(data_cond_tup: tuple[dict, bool]) -> dict | None:
return data_cond_tup[0] if not data_cond_tup[1] else None
class Recipe:
    """Directed graph of beakers (nodes) and transforms (edges).

    Beakers are stored in ``self.beakers``; the graph lives in ``self.graph``
    and a SQLite connection in ``self.db``. A ``_metadata`` table in that
    database holds one JSON document per table name.
    """

    def __init__(self, name: str, db_name: str = "beakers.db"):
        """
        Args:
            name: Name of the recipe.
            db_name: Path passed to ``sqlite3.connect``.
        """
        self.name = name
        self.graph = networkx.DiGraph()
        self.beakers: dict[str, Beaker] = {}
        self.seeds: defaultdict[str, list[Iterable[BaseModel]]] = defaultdict(list)
        self.db = sqlite3.connect(db_name)
        cursor = self.db.cursor()
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS _metadata (table_name TEXT PRIMARY KEY, data JSON)"
        )

    def __repr__(self) -> str:
        return f"Recipe({self.name})"

    @staticmethod
    def _func_name(func: Callable) -> str:
        """Return ``func.__name__``, or its class name when it has none.

        Callable objects (instances with ``__call__``) and partials have no
        ``__name__`` attribute.
        """
        return getattr(func, "__name__", type(func).__name__)

    def add_beaker(
        self,
        name: str,
        datatype: Type[BaseModel] | None,
        beaker_type: Type[Beaker] = SqliteBeaker,
    ) -> Beaker:
        """Register a beaker node and return the created beaker.

        A ``datatype`` of None always yields a TempBeaker; otherwise
        ``beaker_type`` is instantiated.
        """
        self.graph.add_node(name, datatype=datatype)
        beaker: Beaker
        if datatype is None:
            beaker = TempBeaker(name, datatype, self)
        else:
            beaker = beaker_type(name, datatype, self)
        self.beakers[name] = beaker
        return beaker

    def add_transform(
        self,
        from_beaker: str,
        to_beaker: str,
        transform_func: Callable,
        *,
        name: str | None = None,
        error_map: dict[tuple, str] | None = None,
    ) -> None:
        """Add an edge ``from_beaker -> to_beaker`` that applies ``transform_func``.

        Args:
            from_beaker: Name of the source beaker.
            to_beaker: Name of the destination beaker.
            transform_func: Callable applied to each item.
            name: Display name; derived from the callable when omitted
                (lambdas are shown as "λ").
            error_map: Maps exception classes to the name of the beaker that
                receives items whose transform raised them.
        """
        if name is None:
            name = self._func_name(transform_func)
            if name == "<lambda>":
                name = "λ"
        transform = Transform(
            name=name,
            transform_func=transform_func,
            error_map=error_map or {},
        )
        self.graph.add_edge(
            from_beaker,
            to_beaker,
            transform=transform,
        )

    def add_conditional(
        self,
        from_beaker: str,
        condition_func: Callable,
        if_true: str,
        if_false: str = "",
    ) -> None:
        """Route items from ``from_beaker`` according to ``condition_func``.

        An intermediate beaker holds ``(item, condition result)`` pairs;
        ``if_cond_true`` / ``if_cond_false`` then forward the item to
        ``if_true`` / ``if_false``.
        """
        # first add a transform to evaluate the conditional
        func_name = self._func_name(condition_func)
        if func_name == "<lambda>":
            cond_name = f"cond-{from_beaker}"
        else:
            cond_name = f"cond-{from_beaker}-{func_name}"
        self.add_beaker(cond_name, None)
        self.add_transform(
            from_beaker,
            cond_name,
            lambda data: (data, condition_func(data)),
            name=cond_name,
        )

        # then add two filtered paths that remove the condition result
        self.add_beaker(if_true, None)
        self.add_transform(
            cond_name,
            if_true,
            if_cond_true,
        )
        if if_false:
            # NOTE(review): unlike if_true, no beaker is created for if_false
            # here; it has to be registered separately.
            self.add_transform(
                cond_name,
                if_false,
                if_cond_false,
            )

    def add_seed(self, beaker_name: str, data: Iterable[BaseModel]) -> None:
        """Remember ``data`` as a seed batch for ``beaker_name``."""
        self.seeds[beaker_name].append(data)

    def process_seeds(self) -> None:
        """Add every registered seed batch to its beaker."""
        log.info("process_seeds", recipe=self.name)
        for beaker_name, seeds in self.seeds.items():
            for seed in seeds:
                self.beakers[beaker_name].add_items(seed)

    def get_metadata(self, table_name: str) -> dict:
        """Return the stored metadata dict for ``table_name`` ({} when absent)."""
        cursor = self.db.cursor()
        cursor.execute(
            "SELECT data FROM _metadata WHERE table_name = ?",
            (table_name,),
        )
        # This connection has no row_factory, so rows are plain tuples and
        # must be indexed by position.
        row = cursor.fetchone()
        if row is None or row[0] is None:
            log.debug("get_metadata", table_name=table_name, data={})
            return {}
        data = row[0]
        log.debug("get_metadata", table_name=table_name, data=data)
        return json.loads(data)

    def save_metadata(self, table_name: str, data: dict) -> None:
        """Insert or replace the metadata JSON for ``table_name`` and commit."""
        data_json = json.dumps(data)
        log.info("save_metadata", table_name=table_name, data=data_json)
        # sqlite upsert
        cursor = self.db.cursor()
        cursor.execute(
            "INSERT INTO _metadata (table_name, data) VALUES (?, ?) ON CONFLICT(table_name) DO UPDATE SET data = ?",
            (table_name, data_json, data_json),
        )
        self.db.commit()

    def csv_to_beaker(self, filename: str, beaker_name: str) -> None:
        """Load a CSV file into an empty beaker, or verify a previous load.

        When the beaker is empty, each CSV row becomes an item and the file's
        SHA-512 is saved as metadata. Otherwise the file's SHA-512 must equal
        the saved one.

        Raises:
            Exception: if the beaker has data and the checksum differs.
        """
        beaker = self.beakers[beaker_name]
        lg = log.bind(beaker=beaker, filename=filename)
        # three cases: empty, match, mismatch
        if len(beaker) == 0:
            # case 1: empty
            # newline="" is what the csv module expects of file objects.
            with open(filename, "r", newline="") as file:
                reader = csv.DictReader(file)
                added = 0
                for row in reader:
                    beaker.add_item(beaker.model(**row))
                    added += 1
            lg.info("from_csv", case="empty", added=added)
            meta = self.get_metadata(beaker.name)
            meta["sha512"] = get_sha512(filename)
            self.save_metadata(beaker.name, meta)
            return

        old_sha = self.get_metadata(beaker.name).get("sha512")
        new_sha = get_sha512(filename)
        if old_sha != new_sha:
            # case 3: mismatch
            lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha)
            raise Exception("sha512 mismatch")
        # case 2: match
        lg.info("from_csv", case="match")

    def show(self) -> None:
        """Print seeds, beakers, edges and error routes to the terminal."""
        typer.secho("Seeds", fg=typer.colors.GREEN)
        for beaker, seed_batches in self.seeds.items():
            # Number of seed batches registered for this beaker.
            typer.secho(f" {beaker} ({len(seed_batches)})", fg=typer.colors.GREEN)
        graph_data = self.graph_data()
        for node in graph_data:
            if node["temp"]:
                typer.secho(node["name"], fg=typer.colors.CYAN)
            else:
                typer.secho(
                    f"{node['name']} ({node['len']})",
                    fg=typer.colors.GREEN if node["len"] else typer.colors.YELLOW,
                )
            for edge in node["edges"]:
                print(f" -({edge['transform'].name})-> {edge['to_beaker']}")
                for k, v in edge["transform"].error_map.items():
                    if isinstance(k, tuple):
                        typer.secho(
                            f" {' '.join(c.__name__ for c in k)} -> {v}",
                            fg=typer.colors.RED,
                        )
                    else:
                        typer.secho(f" {k.__name__} -> {v}", fg=typer.colors.RED)

    def graph_data(self) -> list[dict]:
        """Return one dict per beaker, sorted by ``(rank, name)``.

        Each dict has ``name``, ``temp``, ``len``, ``edges`` (outgoing edge
        data plus ``to_beaker``) and ``rank`` (one more than the highest rank
        among its predecessors).
        """
        nodes: dict[str, dict] = {}
        for node in networkx.topological_sort(self.graph):
            beaker = self.beakers[node]
            # Topological order guarantees predecessors were ranked already.
            rank = max(
                (nodes[from_b]["rank"] for from_b, _ in self.graph.in_edges(node)),
                default=0,
            )
            nodes[node] = {
                "name": node,
                "temp": isinstance(beaker, TempBeaker),
                "len": len(beaker),
                # Copy edge data so the graph itself is not modified.
                "edges": [
                    {**edge, "to_beaker": to_b}
                    for _, to_b, edge in self.graph.out_edges(node, data=True)
                ],
                "rank": rank + 1,
            }

        # all data collected for display
        return sorted(nodes.values(), key=lambda x: (x["rank"], x["name"]))

    def run_once(
        self, start_beaker: str | None = None, end_beaker: str | None = None
    ) -> None:
        """Push data along every edge once, in topological order.

        Nodes before ``start_beaker`` are skipped and processing stops when
        ``end_beaker`` is reached. Items whose id is already in the
        destination beaker are not transformed again.

        Raises:
            Exception: whatever a transform raises, unless the transform's
                ``error_map`` names a beaker for that exception type.
        """
        log.info("run_once", recipe=self)
        loop = asyncio.get_event_loop()

        started = not start_beaker

        # go through each node in forward order, pushing data
        for node in networkx.topological_sort(self.graph):
            # only process nodes between start and end
            if not started:
                if node == start_beaker:
                    started = True
                    log.info("partial run start", node=node)
                else:
                    log.info("partial run skip", node=node, waiting_for=start_beaker)
                    continue
            if end_beaker and node == end_beaker:
                log.info("partial run end", node=node)
                break

            # get outbound edges
            for from_b, to_b, edge in self.graph.out_edges(node, data=True):
                transform = edge["transform"]
                from_beaker = self.beakers[from_b]
                to_beaker = self.beakers[to_b]
                already_processed = from_beaker.id_set() & to_beaker.id_set()

                log.info(
                    "transform",
                    from_b=from_b,
                    to_b=to_b,
                    to_process=len(from_beaker) - len(already_processed),
                    already_processed=len(already_processed),
                    transform=transform.name,
                )

                for item_id, item in from_beaker.items():
                    if item_id in already_processed:
                        continue
                    try:
                        transformed = transform.transform_func(item)
                        # Await any awaitable result. This covers coroutine
                        # functions and also objects with an async __call__,
                        # which inspect.iscoroutinefunction does not detect.
                        if inspect.isawaitable(transformed):
                            transformed = loop.run_until_complete(transformed)
                        if transformed:
                            to_beaker.add_item(transformed, item_id)
                    except Exception as e:
                        self._route_error(transform, item, item_id, e)

    def _route_error(
        self, transform: Transform, item: BaseModel, item_id: str, exc: Exception
    ) -> None:
        """Store a failed item in the first matching error beaker, else re-raise ``exc``."""
        for error_types, error_beaker_name in transform.error_map.items():
            if isinstance(exc, error_types):
                self.beakers[error_beaker_name].add_item(
                    ErrorType(
                        item=item,
                        exception=str(exc),
                        exc_type=str(type(exc)),
                    ),
                    item_id,
                )
                return
        # no error handler, re-raise
        raise exc

View File

@ -1,23 +0,0 @@
import uuid
from pydantic import BaseModel
class Record:
_reserved_names = ("id",)
def __init__(self, id: str | None = None):
self._id = id if id else str(uuid.uuid1())
self._data: dict[str, BaseModel] = {}
def __getattr__(self, name: str) -> str | BaseModel:
if name == "id":
return self._id
return self._data[name]
def __setattr__(self, name: str, value: BaseModel) -> None:
if name.startswith("_"):
super().__setattr__(name, value)
elif name not in self._data and name not in self._reserved_names:
self._data[name] = value
else:
raise AttributeError(f"DataObject attribute {name} already exists")

View File

@ -1,34 +0,0 @@
from beakers.record import Record
import pytest
def test_record_id_autogen():
    """Records created without an id get distinct 36-character ids."""
    record = Record()
    assert len(record.id) == 36
    first = Record()
    second = Record()
    assert first.id != second.id
def test_record_id_assign():
    """An id passed to the constructor is returned unchanged."""
    record = Record(id="test")
    assert record.id == "test"
def test_record_setattr_good():
    """A new attribute can be set and read back."""
    record = Record()
    record.attrib = "set"
    assert record.attrib == "set"
def test_record_setattr_duplicate():
    """Setting an attribute a second time raises and keeps the first value."""
    record = Record()
    record.attrib = "set"
    with pytest.raises(AttributeError):
        record.attrib = "changed"
    assert record.attrib == "set"
def test_record_setattr_id():
    """Assigning to the reserved ``id`` attribute raises AttributeError."""
    record = Record()
    with pytest.raises(AttributeError):
        record.id = "changed"