diff --git a/examples/fruits.py b/examples/fruits.py deleted file mode 100644 index c9615ed..0000000 --- a/examples/fruits.py +++ /dev/null @@ -1,61 +0,0 @@ -from beakers.recipe import Recipe -import pydantic - - -class Word(pydantic.BaseModel): - word: str - - -class ClassifiedWord(pydantic.BaseModel): - normalized_word: str - is_fruit: bool - - -class Sentence(pydantic.BaseModel): - sentence: list[str] - - -def word_classifier(item) -> ClassifiedWord: - return ClassifiedWord( - normalized_word=item.word.lower(), - is_fruit=item.word.lower() - in ( - "apple", - "banana", - "fig", - "grape", - "lemon", - "mango", - "orange", - "pear", - "raspberry", - ), - ) - - -recipe = Recipe("fruits-example") -recipe.add_beaker("word", Word) -recipe.add_beaker("classified_word", ClassifiedWord) -recipe.add_beaker("sentence", Sentence) -recipe.add_transform("word", "classified_word", word_classifier) -recipe.add_conditional( - "classified_word", - lambda cw: cw.is_fruit, - "fruits", -) -recipe.add_transform( - "fruits", - "sentence", - lambda x: Sentence(sentence=f"I love a fresh {x.normalized_word}".split()), -) - -recipe.add_seed( - "word", - [ - Word(word="apple"), - Word(word="bAnAnA"), - Word(word="hammer"), - Word(word="orange"), - Word(word="EGG"), - ], -) diff --git a/poetry.lock b/poetry.lock index 5d6c98c..0eda427 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "annotated-types" version = "0.5.0" description = "Reusable constraint types to use with typing.Annotated" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -16,7 +15,6 @@ files = [ name = "anyio" version = "3.7.1" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -37,7 +35,6 @@ trio = ["trio (<0.22)"] name = "certifi" version = "2023.5.7" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -49,7 +46,6 @@ files = [ name = "charset-normalizer" version = "3.2.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -130,11 +126,24 @@ files = [ {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] +[[package]] +name = "click" +version = "8.1.6" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"}, + {file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -142,11 +151,32 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "databeakers" +version = "0.1.0" +description = "" +optional = false +python-versions = "^3.10" +files = [] +develop = true + +[package.dependencies] +httpx = "^0.24.0" +networkx = "^3.1" +pydantic = "^2.0.2" +rich = "^13.4.2" +scrapelib = "^2.1.0" +structlog = "^23.1.0" +typer = "^0.9.0" + +[package.source] +type = "directory" +url = "../beakers" + [[package]] name = "h11" version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -158,7 +188,6 @@ files = [ name = "httpcore" version = "0.17.3" description = "A minimal low-level HTTP client." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -170,17 +199,16 @@ files = [ anyio = ">=3.0,<5.0" certifi = "*" h11 = ">=0.13,<0.15" -sniffio = ">=1.0.0,<2.0.0" +sniffio = "==1.*" [package.extras] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "httpx" version = "0.24.1" description = "The next generation HTTP client." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -196,15 +224,14 @@ sniffio = "*" [package.extras] brotli = ["brotli", "brotlicffi"] -cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] +socks = ["socksio (==1.*)"] [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -213,80 +240,44 @@ files = [ ] [[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -category = "dev" +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[[package]] -name = "mypy" -version = "1.4.1" -description = "Optional static typing for Python" -category = "dev" -optional = false -python-versions = ">=3.7" -files = [ - {file = "mypy-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:566e72b0cd6598503e48ea610e0052d1b8168e60a46e0bfd34b3acf2d57f96a8"}, - {file = "mypy-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca637024ca67ab24a7fd6f65d280572c3794665eaf5edcc7e90a866544076878"}, - {file = "mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dde1d180cd84f0624c5dcaaa89c89775550a675aff96b5848de78fb11adabcd"}, - {file = "mypy-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c4d8e89aa7de683e2056a581ce63c46a0c41e31bd2b6d34144e2c80f5ea53dc"}, - {file = "mypy-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:bfdca17c36ae01a21274a3c387a63aa1aafe72bff976522886869ef131b937f1"}, - {file = "mypy-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7549fbf655e5825d787bbc9ecf6028731973f78088fbca3a1f4145c39ef09462"}, - {file = "mypy-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98324ec3ecf12296e6422939e54763faedbfcc502ea4a4c38502082711867258"}, - {file = "mypy-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141dedfdbfe8a04142881ff30ce6e6653c9685b354876b12e4fe6c78598b45e2"}, - {file = "mypy-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8207b7105829eca6f3d774f64a904190bb2231de91b8b186d21ffd98005f14a7"}, - {file = "mypy-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:16f0db5b641ba159eff72cff08edc3875f2b62b2fa2bc24f68c1e7a4e8232d01"}, - {file = "mypy-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:470c969bb3f9a9efcedbadcd19a74ffb34a25f8e6b0e02dae7c0e71f8372f97b"}, - {file = "mypy-1.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5952d2d18b79f7dc25e62e014fe5a23eb1a3d2bc66318df8988a01b1a037c5b"}, - {file = "mypy-1.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:190b6bab0302cec4e9e6767d3eb66085aef2a1cc98fe04936d8a42ed2ba77bb7"}, - {file = "mypy-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9d40652cc4fe33871ad3338581dca3297ff5f2213d0df345bcfbde5162abf0c9"}, - {file = "mypy-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01fd2e9f85622d981fd9063bfaef1aed6e336eaacca00892cd2d82801ab7c042"}, - {file = "mypy-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2460a58faeea905aeb1b9b36f5065f2dc9a9c6e4c992a6499a2360c6c74ceca3"}, - {file = "mypy-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2746d69a8196698146a3dbe29104f9eb6a2a4d8a27878d92169a6c0b74435b6"}, - {file = "mypy-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ae704dcfaa180ff7c4cfbad23e74321a2b774f92ca77fd94ce1049175a21c97f"}, - {file = "mypy-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:43d24f6437925ce50139a310a64b2ab048cb2d3694c84c71c3f2a1626d8101dc"}, - {file = "mypy-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c482e1246726616088532b5e964e39765b6d1520791348e6c9dc3af25b233828"}, - {file = "mypy-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43b592511672017f5b1a483527fd2684347fdffc041c9ef53428c8dc530f79a3"}, - {file = "mypy-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34a9239d5b3502c17f07fd7c0b2ae6b7dd7d7f6af35fbb5072c6208e76295816"}, - {file = "mypy-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5703097c4936bbb9e9bce41478c8d08edd2865e177dc4c52be759f81ee4dd26c"}, - {file = "mypy-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e02d700ec8d9b1859790c0475df4e4092c7bf3272a4fd2c9f33d87fac4427b8f"}, - {file = "mypy-1.4.1-py3-none-any.whl", hash = "sha256:45d32cec14e7b97af848bddd97d85ea4f0db4d5a149ed9676caa4eb2f7402bb4"}, - {file = "mypy-1.4.1.tar.gz", hash = "sha256:9bbcd9ab8ea1f2e1c8031c21445b511442cc45c89951e49bbf852cbb70755b1b"}, + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, ] [package.dependencies] -mypy-extensions = ">=1.0.0" -typing-extensions = ">=4.1.0" +mdurl = ">=0.1,<1.0" [package.extras] -dmypy = ["psutil (>=4.0)"] -install-types = ["pip"] -python2 = ["typed-ast (>=1.4.0,<2)"] -reports = ["lxml"] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] -name = "mypy-extensions" -version = "1.0.0" -description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" optional = false -python-versions = ">=3.5" +python-versions = ">=3.7" files = [ - {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, - {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] [[package]] name = "networkx" version = "3.1" description = "Python package for creating and manipulating graphs and networks" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -301,39 +292,10 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] -[[package]] -name = "packaging" -version = "23.1" -description = "Core utilities for Python packages" -category = "dev" -optional = false -python-versions = ">=3.7" -files = [ - {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, - {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, -] - -[[package]] -name = "pluggy" -version = "1.2.0" -description = "plugin and hook calling mechanisms for python" -category = "dev" -optional = false -python-versions = ">=3.7" -files = [ - {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, - {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - [[package]] name = "pydantic" version = "2.0.2" description = "Data validation using Python type hints" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -353,7 +315,6 @@ email = ["email-validator (>=2.0.0)"] name = "pydantic-core" version = "2.1.2" description = "" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -464,31 +425,23 @@ files = [ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] -name = "pytest" -version = "7.4.0" -description = "pytest: simple powerful testing with Python" -category = "dev" +name = "pygments" +version = "2.15.1" +description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, - {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, + {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, + {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, ] -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" - [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +plugins = ["importlib-metadata"] [[package]] name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -506,11 +459,28 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "rich" +version = "13.5.2" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.5.2-py3-none-any.whl", hash = "sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808"}, + {file = "rich-13.5.2.tar.gz", hash = "sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "scrapelib" version = "2.2.0" description = "" -category = "main" optional = false python-versions = ">=3.7,<4.0" files = [ @@ -526,7 +496,6 @@ urllib3 = ">=1.26,<2.0" name = "sniffio" version = "1.3.0" description = "Sniff out which async library your code is running under" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -534,11 +503,48 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "structlog" +version = "23.1.0" +description = "Structured Logging for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "structlog-23.1.0-py3-none-any.whl", hash = "sha256:79b9e68e48b54e373441e130fa447944e6f87a05b35de23138e475c05d0f7e0e"}, + {file = "structlog-23.1.0.tar.gz", hash = "sha256:270d681dd7d163c11ba500bc914b2472d2b50a8ef00faa999ded5ff83a2f906b"}, +] + +[package.extras] +dev = ["structlog[docs,tests,typing]"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-mermaid", "twisted"] +tests = ["coverage[toml]", "freezegun (>=0.2.8)", "pretend", "pytest (>=6.0)", "pytest-asyncio (>=0.17)", "simplejson"] +typing = ["mypy", "rich", "twisted"] + +[[package]] +name = "typer" +version = "0.9.0" +description = "Typer, build great CLIs. Easy to code. Based on Python type hints." +optional = false +python-versions = ">=3.6" +files = [ + {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"}, + {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"}, +] + +[package.dependencies] +click = ">=7.1.1,<9.0.0" +typing-extensions = ">=3.7.4.3" + +[package.extras] +all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] +doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] +test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] + [[package]] name = "typing-extensions" version = "4.7.1" description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -550,7 +556,6 @@ files = [ name = "urllib3" version = "1.26.16" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -566,4 +571,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "136157484750ab7b80e7896a5dc2905b4f9c7ed20b47dadba4e6ae45f0f6c289" +content-hash = "a121f92ed7a60a9138f1e19bbf88220cd0a5b25c0287f031e36cec0a79188530" diff --git a/pyproject.toml b/pyproject.toml index 1ec18c4..da6f79f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,27 +1,15 @@ [tool.poetry] -name = "beakers" +name = "foiaghost" version = "0.1.0" description = "" authors = ["James Turk "] readme = "README.md" -[tool.poetry.scripts] -bkr = 'beakers.cli:app' - - [tool.poetry.dependencies] python = "^3.11" -#scrapeghost = {path = "../scrapeghost", develop = true} -scrapelib = "^2.1.0" -httpx = "^0.24.0" -networkx = "^3.1" -pydantic = "^2.0.2" +databeakers = {path = "../beakers", develop = true} -[tool.poetry.group.dev.dependencies] -pytest = "^7.4.0" -mypy = "^1.4.1" - [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" diff --git a/src/beakers/__init__.py b/src/beakers/__init__.py deleted file mode 100644 index 13495de..0000000 --- a/src/beakers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .recipe import Recipe diff --git a/src/beakers/beakers.py b/src/beakers/beakers.py deleted file mode 100644 index 8908362..0000000 --- a/src/beakers/beakers.py +++ /dev/null @@ -1,99 +0,0 @@ -import abc -import json -import sqlite3 -import uuid -from pydantic import BaseModel -from typing import Iterable, Type, TYPE_CHECKING - -if TYPE_CHECKING: - from .recipe import Recipe - -PydanticModel = Type[BaseModel] - - -class Beaker(abc.ABC): - def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"): - self.name = name - self.model = model - self.recipe = recipe - - def __repr__(self) -> str: - return f"Beaker({self.name}, {self.model.__name__})" - - @abc.abstractmethod - def items(self) -> Iterable[tuple[str, BaseModel]]: - pass - - @abc.abstractmethod - def __len__(self) -> int: - pass - - @abc.abstractmethod - def add_item(self, item: BaseModel, id: str | None = None) -> None: - pass - - @abc.abstractmethod - def reset(self) -> None: - pass - - def add_items(self, items: Iterable[BaseModel]) -> None: - for item in items: - self.add_item(item) - - def id_set(self) -> set[str]: - return set(id for id, _ in self.items()) - - -class TempBeaker(Beaker): - def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"): - super().__init__(name, model, recipe) - self._items: list[tuple[str, BaseModel]] = [] - - def __len__(self) -> int: - return len(self._items) - - def add_item(self, item: BaseModel, id: str | None = None) -> None: - if id is None: - id = str(uuid.uuid1()) - self._items.append((id, item)) - - def items(self) -> Iterable[tuple[str, BaseModel]]: - yield from self._items - - def reset(self) -> None: - self._items = [] - - -class SqliteBeaker(Beaker): - def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"): - super().__init__(name, model, recipe) - # create table if it doesn't exist - self.cursor = self.recipe.db.cursor() - self.cursor.row_factory = sqlite3.Row # type: ignore - self.cursor.execute( - f"CREATE TABLE IF NOT EXISTS {self.name} (uuid TEXT PRIMARY KEY, data JSON)" - ) - - def items(self) -> Iterable[tuple[str, BaseModel]]: - self.cursor.execute(f"SELECT uuid, data FROM {self.name}") - data = self.cursor.fetchall() - for item in data: - yield item["uuid"], self.model(**json.loads(item["data"])) - - def __len__(self) -> int: - self.cursor.execute(f"SELECT COUNT(*) FROM {self.name}") - return self.cursor.fetchone()[0] - - def add_item(self, item: BaseModel, id: str | None = None) -> None: - if id is None: - id = str(uuid.uuid1()) - print("UUID", id, item) - self.cursor.execute( - f"INSERT INTO {self.name} (uuid, data) VALUES (?, ?)", - (id, item.model_dump_json()), - ) - self.recipe.db.commit() - - def reset(self) -> None: - self.cursor.execute(f"DELETE FROM {self.name}") - self.recipe.db.commit() diff --git a/src/beakers/cli.py b/src/beakers/cli.py deleted file mode 100644 index 94bdde1..0000000 --- a/src/beakers/cli.py +++ /dev/null @@ -1,77 +0,0 @@ -import importlib -from types import SimpleNamespace -import typer -import sys -from pprint import pprint -from typing import List, Optional -from typing_extensions import Annotated - -from beakers.beakers import SqliteBeaker - -app = typer.Typer() - - -def _load_recipe(dotted_path: str) -> SimpleNamespace: - sys.path.append(".") - path, name = dotted_path.rsplit(".", 1) - mod = importlib.import_module(path) - return getattr(mod, name) - - -@app.callback() -def main( - ctx: typer.Context, - recipe: str = typer.Option(None, envvar="BEAKER_RECIPE"), -) -> None: - if not recipe: - typer.secho( - "Missing recipe; pass --recipe or set env[BEAKER_RECIPE]", - fg=typer.colors.RED, - ) - raise typer.Exit(1) - ctx.obj = _load_recipe(recipe) - - -@app.command() -def reset(ctx: typer.Context) -> None: - for beaker in ctx.obj.beakers.values(): - if isinstance(beaker, SqliteBeaker): - if bl := len(beaker): - beaker.reset() - typer.secho(f"{beaker.name} reset ({bl})", fg=typer.colors.RED) - else: - typer.secho(f"{beaker.name} empty", fg=typer.colors.GREEN) - - -@app.command() -def show(ctx: typer.Context) -> None: - ctx.obj.show() - - -@app.command() -def graph(ctx: typer.Context) -> None: - pprint(ctx.obj.graph_data()) - - -@app.command() -def run( - ctx: typer.Context, - input: Annotated[Optional[List[str]], typer.Option(...)] = None, - start: Optional[str] = typer.Option(None), - end: Optional[str] = typer.Option(None), -) -> None: - if ctx.obj.seeds: - typer.secho("Seeding beakers", fg=typer.colors.GREEN) - ctx.obj.process_seeds() - has_data = any(ctx.obj.beakers.values()) - if not input and not has_data: - typer.secho("No data; pass --input to seed beaker(s)", fg=typer.colors.RED) - raise typer.Exit(1) - for input_str in input or []: - beaker, filename = input_str.split("=") - ctx.obj.csv_to_beaker(filename, beaker) - ctx.obj.run_once(start, end) - - -if __name__ == "__main__": - app() diff --git a/src/beakers/http.py b/src/beakers/http.py deleted file mode 100644 index 1f518b5..0000000 --- a/src/beakers/http.py +++ /dev/null @@ -1,41 +0,0 @@ -import httpx -from pydantic import BaseModel, Field -import datetime - - -class HttpResponse(BaseModel): - """ - Beaker data type that represents an HTTP response. - """ - - url: str - status_code: int - response_body: str - retrieved_at: datetime.datetime = Field(default_factory=datetime.datetime.now) - - -class HttpRequest: - """ - Filter that converts from a beaker with a URL to a beaker with an HTTP response. - """ - - def __init__(self, beaker: str, field: str): - """ - Args: - beaker: The name of the beaker that contains the URL. - field: The name of the field in the beaker that contains the URL. - """ - self.beaker = beaker - self.field = field - - async def __call__(self, item: BaseModel) -> HttpResponse: - url = getattr(item, self.field) - - async with httpx.AsyncClient() as client: - response = await client.get(url) - - return HttpResponse( - url=url, - status_code=response.status_code, - response_body=response.text, - ) diff --git a/src/beakers/recipe.py b/src/beakers/recipe.py deleted file mode 100644 index 9ff1c1b..0000000 --- a/src/beakers/recipe.py +++ /dev/null @@ -1,319 +0,0 @@ -import csv -import json -import typer -import inspect -import sqlite3 -import hashlib -import asyncio -import networkx # type: ignore -from collections import defaultdict, Counter -from typing import Iterable, Callable, Type -from pydantic import BaseModel, ConfigDict -from structlog import get_logger - -from .beakers import Beaker, SqliteBeaker, TempBeaker - -log = get_logger() - - -def get_sha512(filename: str) -> str: - with open(filename, "rb") as file: - return hashlib.sha512(file.read()).hexdigest() - - -class Transform(BaseModel): - model_config = ConfigDict(frozen=True) - - name: str - transform_func: Callable - error_map: dict[tuple, str] - - -class ErrorType(BaseModel): - item: BaseModel - exception: str - exc_type: str - - -def if_cond_true(data_cond_tup: tuple[dict, bool]) -> dict | None: - return data_cond_tup[0] if data_cond_tup[1] else None - - -def if_cond_false(data_cond_tup: tuple[dict, bool]) -> dict | None: - return data_cond_tup[0] if not data_cond_tup[1] else None - - -class Recipe: - def __init__(self, name: str, db_name: str = "beakers.db"): - self.name = name - self.graph = networkx.DiGraph() - self.beakers: dict[str, Beaker] = {} - self.seeds: defaultdict[str, list[Iterable[BaseModel]]] = defaultdict(list) - self.db = sqlite3.connect(db_name) - cursor = self.db.cursor() - cursor.execute( - "CREATE TABLE IF NOT EXISTS _metadata (table_name TEXT PRIMARY KEY, data JSON)" - ) - - def __repr__(self) -> str: - return f"Recipe({self.name})" - - def add_beaker( - self, - name: str, - datatype: Type[BaseModel], - beaker_type: Type[Beaker] = SqliteBeaker, - ) -> Beaker: - self.graph.add_node(name, datatype=datatype) - if datatype is None: - self.beakers[name] = TempBeaker(name, datatype, self) - else: - self.beakers[name] = SqliteBeaker(name, datatype, self) - return self.beakers[name] - - def add_transform( - self, - from_beaker: str, - to_beaker: str, - transform_func: Callable, - *, - name: str | None = None, - error_map: dict[tuple, str] | None = None, - ) -> None: - if name is None: - name = transform_func.__name__ - if name == "": - name = "λ" - transform = Transform( - name=name, - transform_func=transform_func, - error_map=error_map or {}, - ) - self.graph.add_edge( - from_beaker, - to_beaker, - transform=transform, - ) - - def add_conditional( - self, - from_beaker: str, - condition_func: Callable, - if_true: str, - if_false: str = "", - ) -> None: - # first add a transform to evaluate the conditional - if condition_func.__name__ == "": - cond_name = f"cond-{from_beaker}" - else: - cond_name = f"cond-{from_beaker}-{condition_func.__name__}" - self.add_beaker(cond_name, None) - self.add_transform( - from_beaker, - cond_name, - lambda data: (data, condition_func(data)), - name=cond_name, - ) - - # then add two filtered paths that remove the condition result - self.add_beaker(if_true, None) - self.add_transform( - cond_name, - if_true, - if_cond_true, - ) - if if_false: - self.add_transform( - cond_name, - if_false, - if_cond_false, - ) - - def add_seed(self, beaker_name: str, data: Iterable[BaseModel]) -> None: - self.seeds[beaker_name].append(data) - - def process_seeds(self) -> None: - log.info("process_seeds", recipe=self.name) - for beaker_name, seeds in self.seeds.items(): - for seed in seeds: - self.beakers[beaker_name].add_items(seed) - - def get_metadata(self, table_name: str) -> dict: - cursor = self.db.cursor() - cursor.execute( - "SELECT data FROM _metadata WHERE table_name = ?", - (table_name,), - ) - try: - data = cursor.fetchone()["data"] - log.debug("get_metadata", table_name=table_name, data=data) - return json.loads(data) - except TypeError: - log.debug("get_metadata", table_name=table_name, data={}) - return {} - - def save_metadata(self, table_name: str, data: dict) -> None: - data_json = json.dumps(data) - log.info("save_metadata", table_name=table_name, data=data_json) - # sqlite upsert - cursor = self.db.cursor() - cursor.execute( - "INSERT INTO _metadata (table_name, data) VALUES (?, ?) ON CONFLICT(table_name) DO UPDATE SET data = ?", - (table_name, data_json, data_json), - ) - self.db.commit() - - def csv_to_beaker(self, filename: str, beaker_name: str) -> None: - beaker = self.beakers[beaker_name] - lg = log.bind(beaker=beaker, filename=filename) - # three cases: empty, match, mismatch - # case 1: empty - if len(beaker) == 0: - with open(filename, "r") as file: - reader = csv.DictReader(file) - added = 0 - for row in reader: - beaker.add_item(beaker.model(**row)) - added += 1 - lg.info("from_csv", case="empty", added=added) - meta = self.get_metadata(beaker.name) - meta["sha512"] = get_sha512(filename) - self.save_metadata(beaker.name, meta) - else: - old_sha = self.get_metadata(beaker.name).get("sha512") - new_sha = get_sha512(filename) - if old_sha != new_sha: - # case 3: mismatch - lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha) - raise Exception("sha512 mismatch") - else: - # case 2: match - lg.info("from_csv", case="match") - - def show(self) -> None: - seed_count = Counter(self.seeds.keys()) - typer.secho("Seeds", fg=typer.colors.GREEN) - for beaker, count in seed_count.items(): - typer.secho(f" {beaker} ({count})", fg=typer.colors.GREEN) - graph_data = self.graph_data() - for node in graph_data: - if node["temp"]: - typer.secho(node["name"], fg=typer.colors.CYAN) - else: - typer.secho( - f"{node['name']} ({node['len']})", - fg=typer.colors.GREEN if node["len"] else typer.colors.YELLOW, - ) - for edge in node["edges"]: - print(f" -({edge['transform'].name})-> {edge['to_beaker']}") - for k, v in edge["transform"].error_map.items(): - if isinstance(k, tuple): - typer.secho( - f" {' '.join(c.__name__ for c in k)} -> {v}", - fg=typer.colors.RED, - ) - else: - typer.secho(f" {k.__name__} -> {v}", fg=typer.colors.RED) - - def graph_data(self) -> list[dict]: - nodes = {} - - for node in networkx.topological_sort(self.graph): - beaker = self.beakers[node] - temp = isinstance(beaker, TempBeaker) - - nodes[node] = { - "name": node, - "temp": temp, - "len": len(beaker), - "edges": [], - } - - rank = 0 - for from_b, to_b, edge in self.graph.in_edges(node, data=True): - if nodes[from_b]["rank"] > rank: - rank = nodes[from_b]["rank"] - nodes[node]["rank"] = rank + 1 - - for from_b, to_b, edge in self.graph.out_edges(node, data=True): - edge["to_beaker"] = to_b - nodes[node]["edges"].append(edge) - - # all data collected for display - return sorted(nodes.values(), key=lambda x: (x["rank"], x["name"])) - - def run_once( - self, start_beaker: str | None = None, end_beaker: str | None = None - ) -> None: - log.info("run_once", recipe=self) - loop = asyncio.get_event_loop() - - started = False if start_beaker else True - - # go through each node in forward order, pushing data - for node in networkx.topological_sort(self.graph): - # only process nodes between start and end - if not started: - if node == start_beaker: - started = True - log.info("partial run start", node=node) - else: - log.info("partial run skip", node=node, waiting_for=start_beaker) - continue - if end_beaker and node == end_beaker: - log.info("partial run end", node=node) - break - - # get outbound edges - edges = self.graph.out_edges(node, data=True) - - for from_b, to_b, edge in edges: - transform = edge["transform"] - - from_beaker = self.beakers[from_b] - to_beaker = self.beakers[to_b] - already_processed = from_beaker.id_set() & to_beaker.id_set() - - log.info( - "transform", - from_b=from_b, - to_b=to_b, - to_process=len(from_beaker) - len(already_processed), - already_processed=len(already_processed), - transform=edge["transform"].name, - ) - - # convert coroutine to function - if inspect.iscoroutinefunction(transform.transform_func): - t_func = lambda x: loop.run_until_complete( - transform.transform_func(x) - ) - else: - t_func = transform.transform_func - - for id, item in from_beaker.items(): - if id in already_processed: - continue - try: - transformed = t_func(item) - if transformed: - to_beaker.add_item(transformed, id) - except Exception as e: - for ( - error_types, - error_beaker_name, - ) in transform.error_map.items(): - if isinstance(e, error_types): - error_beaker = self.beakers[error_beaker_name] - error_beaker.add_item( - ErrorType( - item=item, - exception=str(e), - exc_type=str(type(e)), - ), - id, - ) - break - else: - # no error handler, re-raise - raise diff --git a/src/beakers/record.py b/src/beakers/record.py deleted file mode 100644 index 5c358d6..0000000 --- a/src/beakers/record.py +++ /dev/null @@ -1,23 +0,0 @@ -import uuid -from pydantic import BaseModel - - -class Record: - _reserved_names = ("id",) - - def __init__(self, id: str | None = None): - self._id = id if id else str(uuid.uuid1()) - self._data: dict[str, BaseModel] = {} - - def __getattr__(self, name: str) -> str | BaseModel: - if name == "id": - return self._id - return self._data[name] - - def __setattr__(self, name: str, value: BaseModel) -> None: - if name.startswith("_"): - super().__setattr__(name, value) - elif name not in self._data and name not in self._reserved_names: - self._data[name] = value - else: - raise AttributeError(f"DataObject attribute {name} already exists") diff --git a/tests/test_record.py b/tests/test_record.py deleted file mode 100644 index d30c570..0000000 --- a/tests/test_record.py +++ /dev/null @@ -1,34 +0,0 @@ -from beakers.record import Record -import pytest - - -def test_record_id_autogen(): - r = Record() - assert len(r.id) == 36 - r2, r3 = Record(), Record() - assert r2.id != r3.id - - -def test_record_id_assign(): - r = Record(id="test") - assert r.id == "test" - - -def test_record_setattr_good(): - r = Record() - r.attrib = "set" - assert r.attrib == "set" - - -def test_record_setattr_duplicate(): - r = Record() - r.attrib = "set" - with pytest.raises(AttributeError): - r.attrib = "changed" - assert r.attrib == "set" - - -def test_record_setattr_id(): - r = Record() - with pytest.raises(AttributeError): - r.id = "changed"