databeakers
This commit is contained in:
parent
c21954ebba
commit
b357a6d1d5
@ -1,61 +0,0 @@
|
||||
from beakers.recipe import Recipe
|
||||
import pydantic
|
||||
|
||||
|
||||
class Word(pydantic.BaseModel):
|
||||
word: str
|
||||
|
||||
|
||||
class ClassifiedWord(pydantic.BaseModel):
|
||||
normalized_word: str
|
||||
is_fruit: bool
|
||||
|
||||
|
||||
class Sentence(pydantic.BaseModel):
|
||||
sentence: list[str]
|
||||
|
||||
|
||||
def word_classifier(item) -> ClassifiedWord:
|
||||
return ClassifiedWord(
|
||||
normalized_word=item.word.lower(),
|
||||
is_fruit=item.word.lower()
|
||||
in (
|
||||
"apple",
|
||||
"banana",
|
||||
"fig",
|
||||
"grape",
|
||||
"lemon",
|
||||
"mango",
|
||||
"orange",
|
||||
"pear",
|
||||
"raspberry",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
recipe = Recipe("fruits-example")
|
||||
recipe.add_beaker("word", Word)
|
||||
recipe.add_beaker("classified_word", ClassifiedWord)
|
||||
recipe.add_beaker("sentence", Sentence)
|
||||
recipe.add_transform("word", "classified_word", word_classifier)
|
||||
recipe.add_conditional(
|
||||
"classified_word",
|
||||
lambda cw: cw.is_fruit,
|
||||
"fruits",
|
||||
)
|
||||
recipe.add_transform(
|
||||
"fruits",
|
||||
"sentence",
|
||||
lambda x: Sentence(sentence=f"I love a fresh {x.normalized_word}".split()),
|
||||
)
|
||||
|
||||
recipe.add_seed(
|
||||
"word",
|
||||
[
|
||||
Word(word="apple"),
|
||||
Word(word="bAnAnA"),
|
||||
Word(word="hammer"),
|
||||
Word(word="orange"),
|
||||
Word(word="EGG"),
|
||||
],
|
||||
)
|
245
poetry.lock
generated
245
poetry.lock
generated
@ -1,10 +1,9 @@
|
||||
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "annotated-types"
|
||||
version = "0.5.0"
|
||||
description = "Reusable constraint types to use with typing.Annotated"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -16,7 +15,6 @@ files = [
|
||||
name = "anyio"
|
||||
version = "3.7.1"
|
||||
description = "High level compatibility layer for multiple asynchronous event loop implementations"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -37,7 +35,6 @@ trio = ["trio (<0.22)"]
|
||||
name = "certifi"
|
||||
version = "2023.5.7"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
@ -49,7 +46,6 @@ files = [
|
||||
name = "charset-normalizer"
|
||||
version = "3.2.0"
|
||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
@ -130,11 +126,24 @@ files = [
|
||||
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "click"
|
||||
version = "8.1.6"
|
||||
description = "Composable command line interface toolkit"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"},
|
||||
{file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
description = "Cross-platform colored terminal text."
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||
files = [
|
||||
@ -142,11 +151,32 @@ files = [
|
||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "databeakers"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
optional = false
|
||||
python-versions = "^3.10"
|
||||
files = []
|
||||
develop = true
|
||||
|
||||
[package.dependencies]
|
||||
httpx = "^0.24.0"
|
||||
networkx = "^3.1"
|
||||
pydantic = "^2.0.2"
|
||||
rich = "^13.4.2"
|
||||
scrapelib = "^2.1.0"
|
||||
structlog = "^23.1.0"
|
||||
typer = "^0.9.0"
|
||||
|
||||
[package.source]
|
||||
type = "directory"
|
||||
url = "../beakers"
|
||||
|
||||
[[package]]
|
||||
name = "h11"
|
||||
version = "0.14.0"
|
||||
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -158,7 +188,6 @@ files = [
|
||||
name = "httpcore"
|
||||
version = "0.17.3"
|
||||
description = "A minimal low-level HTTP client."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -170,17 +199,16 @@ files = [
|
||||
anyio = ">=3.0,<5.0"
|
||||
certifi = "*"
|
||||
h11 = ">=0.13,<0.15"
|
||||
sniffio = ">=1.0.0,<2.0.0"
|
||||
sniffio = "==1.*"
|
||||
|
||||
[package.extras]
|
||||
http2 = ["h2 (>=3,<5)"]
|
||||
socks = ["socksio (>=1.0.0,<2.0.0)"]
|
||||
socks = ["socksio (==1.*)"]
|
||||
|
||||
[[package]]
|
||||
name = "httpx"
|
||||
version = "0.24.1"
|
||||
description = "The next generation HTTP client."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -196,15 +224,14 @@ sniffio = "*"
|
||||
|
||||
[package.extras]
|
||||
brotli = ["brotli", "brotlicffi"]
|
||||
cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"]
|
||||
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
|
||||
http2 = ["h2 (>=3,<5)"]
|
||||
socks = ["socksio (>=1.0.0,<2.0.0)"]
|
||||
socks = ["socksio (==1.*)"]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "3.4"
|
||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
files = [
|
||||
@ -213,80 +240,44 @@ files = [
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.0.0"
|
||||
description = "brain-dead simple config-ini parsing"
|
||||
category = "dev"
|
||||
name = "markdown-it-py"
|
||||
version = "3.0.0"
|
||||
description = "Python port of markdown-it. Markdown parsing, done right!"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
|
||||
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mypy"
|
||||
version = "1.4.1"
|
||||
description = "Optional static typing for Python"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "mypy-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:566e72b0cd6598503e48ea610e0052d1b8168e60a46e0bfd34b3acf2d57f96a8"},
|
||||
{file = "mypy-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca637024ca67ab24a7fd6f65d280572c3794665eaf5edcc7e90a866544076878"},
|
||||
{file = "mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dde1d180cd84f0624c5dcaaa89c89775550a675aff96b5848de78fb11adabcd"},
|
||||
{file = "mypy-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c4d8e89aa7de683e2056a581ce63c46a0c41e31bd2b6d34144e2c80f5ea53dc"},
|
||||
{file = "mypy-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:bfdca17c36ae01a21274a3c387a63aa1aafe72bff976522886869ef131b937f1"},
|
||||
{file = "mypy-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7549fbf655e5825d787bbc9ecf6028731973f78088fbca3a1f4145c39ef09462"},
|
||||
{file = "mypy-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98324ec3ecf12296e6422939e54763faedbfcc502ea4a4c38502082711867258"},
|
||||
{file = "mypy-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141dedfdbfe8a04142881ff30ce6e6653c9685b354876b12e4fe6c78598b45e2"},
|
||||
{file = "mypy-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8207b7105829eca6f3d774f64a904190bb2231de91b8b186d21ffd98005f14a7"},
|
||||
{file = "mypy-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:16f0db5b641ba159eff72cff08edc3875f2b62b2fa2bc24f68c1e7a4e8232d01"},
|
||||
{file = "mypy-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:470c969bb3f9a9efcedbadcd19a74ffb34a25f8e6b0e02dae7c0e71f8372f97b"},
|
||||
{file = "mypy-1.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5952d2d18b79f7dc25e62e014fe5a23eb1a3d2bc66318df8988a01b1a037c5b"},
|
||||
{file = "mypy-1.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:190b6bab0302cec4e9e6767d3eb66085aef2a1cc98fe04936d8a42ed2ba77bb7"},
|
||||
{file = "mypy-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9d40652cc4fe33871ad3338581dca3297ff5f2213d0df345bcfbde5162abf0c9"},
|
||||
{file = "mypy-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01fd2e9f85622d981fd9063bfaef1aed6e336eaacca00892cd2d82801ab7c042"},
|
||||
{file = "mypy-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2460a58faeea905aeb1b9b36f5065f2dc9a9c6e4c992a6499a2360c6c74ceca3"},
|
||||
{file = "mypy-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2746d69a8196698146a3dbe29104f9eb6a2a4d8a27878d92169a6c0b74435b6"},
|
||||
{file = "mypy-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ae704dcfaa180ff7c4cfbad23e74321a2b774f92ca77fd94ce1049175a21c97f"},
|
||||
{file = "mypy-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:43d24f6437925ce50139a310a64b2ab048cb2d3694c84c71c3f2a1626d8101dc"},
|
||||
{file = "mypy-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c482e1246726616088532b5e964e39765b6d1520791348e6c9dc3af25b233828"},
|
||||
{file = "mypy-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43b592511672017f5b1a483527fd2684347fdffc041c9ef53428c8dc530f79a3"},
|
||||
{file = "mypy-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34a9239d5b3502c17f07fd7c0b2ae6b7dd7d7f6af35fbb5072c6208e76295816"},
|
||||
{file = "mypy-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5703097c4936bbb9e9bce41478c8d08edd2865e177dc4c52be759f81ee4dd26c"},
|
||||
{file = "mypy-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e02d700ec8d9b1859790c0475df4e4092c7bf3272a4fd2c9f33d87fac4427b8f"},
|
||||
{file = "mypy-1.4.1-py3-none-any.whl", hash = "sha256:45d32cec14e7b97af848bddd97d85ea4f0db4d5a149ed9676caa4eb2f7402bb4"},
|
||||
{file = "mypy-1.4.1.tar.gz", hash = "sha256:9bbcd9ab8ea1f2e1c8031c21445b511442cc45c89951e49bbf852cbb70755b1b"},
|
||||
{file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
|
||||
{file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
mypy-extensions = ">=1.0.0"
|
||||
typing-extensions = ">=4.1.0"
|
||||
mdurl = ">=0.1,<1.0"
|
||||
|
||||
[package.extras]
|
||||
dmypy = ["psutil (>=4.0)"]
|
||||
install-types = ["pip"]
|
||||
python2 = ["typed-ast (>=1.4.0,<2)"]
|
||||
reports = ["lxml"]
|
||||
benchmarking = ["psutil", "pytest", "pytest-benchmark"]
|
||||
code-style = ["pre-commit (>=3.0,<4.0)"]
|
||||
compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
|
||||
linkify = ["linkify-it-py (>=1,<3)"]
|
||||
plugins = ["mdit-py-plugins"]
|
||||
profiling = ["gprof2dot"]
|
||||
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
|
||||
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
|
||||
|
||||
[[package]]
|
||||
name = "mypy-extensions"
|
||||
version = "1.0.0"
|
||||
description = "Type system extensions for programs checked with the mypy type checker."
|
||||
category = "dev"
|
||||
name = "mdurl"
|
||||
version = "0.1.2"
|
||||
description = "Markdown URL utilities"
|
||||
optional = false
|
||||
python-versions = ">=3.5"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
|
||||
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
|
||||
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
|
||||
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "networkx"
|
||||
version = "3.1"
|
||||
description = "Python package for creating and manipulating graphs and networks"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
@ -301,39 +292,10 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-
|
||||
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
|
||||
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "23.1"
|
||||
description = "Core utilities for Python packages"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"},
|
||||
{file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
version = "1.2.0"
|
||||
description = "plugin and hook calling mechanisms for python"
|
||||
category = "dev"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"},
|
||||
{file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["pre-commit", "tox"]
|
||||
testing = ["pytest", "pytest-benchmark"]
|
||||
|
||||
[[package]]
|
||||
name = "pydantic"
|
||||
version = "2.0.2"
|
||||
description = "Data validation using Python type hints"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -353,7 +315,6 @@ email = ["email-validator (>=2.0.0)"]
|
||||
name = "pydantic-core"
|
||||
version = "2.1.2"
|
||||
description = ""
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -464,31 +425,23 @@ files = [
|
||||
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
version = "7.4.0"
|
||||
description = "pytest: simple powerful testing with Python"
|
||||
category = "dev"
|
||||
name = "pygments"
|
||||
version = "2.15.1"
|
||||
description = "Pygments is a syntax highlighting package written in Python."
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"},
|
||||
{file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"},
|
||||
{file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"},
|
||||
{file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = {version = "*", markers = "sys_platform == \"win32\""}
|
||||
iniconfig = "*"
|
||||
packaging = "*"
|
||||
pluggy = ">=0.12,<2.0"
|
||||
|
||||
[package.extras]
|
||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
plugins = ["importlib-metadata"]
|
||||
|
||||
[[package]]
|
||||
name = "requests"
|
||||
version = "2.31.0"
|
||||
description = "Python HTTP for Humans."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -506,11 +459,28 @@ urllib3 = ">=1.21.1,<3"
|
||||
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
|
||||
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
||||
|
||||
[[package]]
|
||||
name = "rich"
|
||||
version = "13.5.2"
|
||||
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
files = [
|
||||
{file = "rich-13.5.2-py3-none-any.whl", hash = "sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808"},
|
||||
{file = "rich-13.5.2.tar.gz", hash = "sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
markdown-it-py = ">=2.2.0"
|
||||
pygments = ">=2.13.0,<3.0.0"
|
||||
|
||||
[package.extras]
|
||||
jupyter = ["ipywidgets (>=7.5.1,<9)"]
|
||||
|
||||
[[package]]
|
||||
name = "scrapelib"
|
||||
version = "2.2.0"
|
||||
description = ""
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7,<4.0"
|
||||
files = [
|
||||
@ -526,7 +496,6 @@ urllib3 = ">=1.26,<2.0"
|
||||
name = "sniffio"
|
||||
version = "1.3.0"
|
||||
description = "Sniff out which async library your code is running under"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -534,11 +503,48 @@ files = [
|
||||
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "structlog"
|
||||
version = "23.1.0"
|
||||
description = "Structured Logging for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "structlog-23.1.0-py3-none-any.whl", hash = "sha256:79b9e68e48b54e373441e130fa447944e6f87a05b35de23138e475c05d0f7e0e"},
|
||||
{file = "structlog-23.1.0.tar.gz", hash = "sha256:270d681dd7d163c11ba500bc914b2472d2b50a8ef00faa999ded5ff83a2f906b"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["structlog[docs,tests,typing]"]
|
||||
docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-mermaid", "twisted"]
|
||||
tests = ["coverage[toml]", "freezegun (>=0.2.8)", "pretend", "pytest (>=6.0)", "pytest-asyncio (>=0.17)", "simplejson"]
|
||||
typing = ["mypy", "rich", "twisted"]
|
||||
|
||||
[[package]]
|
||||
name = "typer"
|
||||
version = "0.9.0"
|
||||
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
|
||||
{file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
click = ">=7.1.1,<9.0.0"
|
||||
typing-extensions = ">=3.7.4.3"
|
||||
|
||||
[package.extras]
|
||||
all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
|
||||
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
|
||||
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.7.1"
|
||||
description = "Backported and Experimental Type Hints for Python 3.7+"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
@ -550,7 +556,6 @@ files = [
|
||||
name = "urllib3"
|
||||
version = "1.26.16"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||
files = [
|
||||
@ -566,4 +571,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "136157484750ab7b80e7896a5dc2905b4f9c7ed20b47dadba4e6ae45f0f6c289"
|
||||
content-hash = "a121f92ed7a60a9138f1e19bbf88220cd0a5b25c0287f031e36cec0a79188530"
|
||||
|
@ -1,27 +1,15 @@
|
||||
[tool.poetry]
|
||||
name = "beakers"
|
||||
name = "foiaghost"
|
||||
version = "0.1.0"
|
||||
description = ""
|
||||
authors = ["James Turk <dev@jamesturk.net>"]
|
||||
readme = "README.md"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
bkr = 'beakers.cli:app'
|
||||
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.11"
|
||||
#scrapeghost = {path = "../scrapeghost", develop = true}
|
||||
scrapelib = "^2.1.0"
|
||||
httpx = "^0.24.0"
|
||||
networkx = "^3.1"
|
||||
pydantic = "^2.0.2"
|
||||
databeakers = {path = "../beakers", develop = true}
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^7.4.0"
|
||||
mypy = "^1.4.1"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
@ -1 +0,0 @@
|
||||
from .recipe import Recipe
|
@ -1,99 +0,0 @@
|
||||
import abc
|
||||
import json
|
||||
import sqlite3
|
||||
import uuid
|
||||
from pydantic import BaseModel
|
||||
from typing import Iterable, Type, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .recipe import Recipe
|
||||
|
||||
PydanticModel = Type[BaseModel]
|
||||
|
||||
|
||||
class Beaker(abc.ABC):
|
||||
def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
|
||||
self.name = name
|
||||
self.model = model
|
||||
self.recipe = recipe
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"Beaker({self.name}, {self.model.__name__})"
|
||||
|
||||
@abc.abstractmethod
|
||||
def items(self) -> Iterable[tuple[str, BaseModel]]:
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def __len__(self) -> int:
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def add_item(self, item: BaseModel, id: str | None = None) -> None:
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def reset(self) -> None:
|
||||
pass
|
||||
|
||||
def add_items(self, items: Iterable[BaseModel]) -> None:
|
||||
for item in items:
|
||||
self.add_item(item)
|
||||
|
||||
def id_set(self) -> set[str]:
|
||||
return set(id for id, _ in self.items())
|
||||
|
||||
|
||||
class TempBeaker(Beaker):
|
||||
def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
|
||||
super().__init__(name, model, recipe)
|
||||
self._items: list[tuple[str, BaseModel]] = []
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._items)
|
||||
|
||||
def add_item(self, item: BaseModel, id: str | None = None) -> None:
|
||||
if id is None:
|
||||
id = str(uuid.uuid1())
|
||||
self._items.append((id, item))
|
||||
|
||||
def items(self) -> Iterable[tuple[str, BaseModel]]:
|
||||
yield from self._items
|
||||
|
||||
def reset(self) -> None:
|
||||
self._items = []
|
||||
|
||||
|
||||
class SqliteBeaker(Beaker):
|
||||
def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
|
||||
super().__init__(name, model, recipe)
|
||||
# create table if it doesn't exist
|
||||
self.cursor = self.recipe.db.cursor()
|
||||
self.cursor.row_factory = sqlite3.Row # type: ignore
|
||||
self.cursor.execute(
|
||||
f"CREATE TABLE IF NOT EXISTS {self.name} (uuid TEXT PRIMARY KEY, data JSON)"
|
||||
)
|
||||
|
||||
def items(self) -> Iterable[tuple[str, BaseModel]]:
|
||||
self.cursor.execute(f"SELECT uuid, data FROM {self.name}")
|
||||
data = self.cursor.fetchall()
|
||||
for item in data:
|
||||
yield item["uuid"], self.model(**json.loads(item["data"]))
|
||||
|
||||
def __len__(self) -> int:
|
||||
self.cursor.execute(f"SELECT COUNT(*) FROM {self.name}")
|
||||
return self.cursor.fetchone()[0]
|
||||
|
||||
def add_item(self, item: BaseModel, id: str | None = None) -> None:
|
||||
if id is None:
|
||||
id = str(uuid.uuid1())
|
||||
print("UUID", id, item)
|
||||
self.cursor.execute(
|
||||
f"INSERT INTO {self.name} (uuid, data) VALUES (?, ?)",
|
||||
(id, item.model_dump_json()),
|
||||
)
|
||||
self.recipe.db.commit()
|
||||
|
||||
def reset(self) -> None:
|
||||
self.cursor.execute(f"DELETE FROM {self.name}")
|
||||
self.recipe.db.commit()
|
@ -1,77 +0,0 @@
|
||||
import importlib
|
||||
from types import SimpleNamespace
|
||||
import typer
|
||||
import sys
|
||||
from pprint import pprint
|
||||
from typing import List, Optional
|
||||
from typing_extensions import Annotated
|
||||
|
||||
from beakers.beakers import SqliteBeaker
|
||||
|
||||
app = typer.Typer()
|
||||
|
||||
|
||||
def _load_recipe(dotted_path: str) -> SimpleNamespace:
|
||||
sys.path.append(".")
|
||||
path, name = dotted_path.rsplit(".", 1)
|
||||
mod = importlib.import_module(path)
|
||||
return getattr(mod, name)
|
||||
|
||||
|
||||
@app.callback()
|
||||
def main(
|
||||
ctx: typer.Context,
|
||||
recipe: str = typer.Option(None, envvar="BEAKER_RECIPE"),
|
||||
) -> None:
|
||||
if not recipe:
|
||||
typer.secho(
|
||||
"Missing recipe; pass --recipe or set env[BEAKER_RECIPE]",
|
||||
fg=typer.colors.RED,
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
ctx.obj = _load_recipe(recipe)
|
||||
|
||||
|
||||
@app.command()
|
||||
def reset(ctx: typer.Context) -> None:
|
||||
for beaker in ctx.obj.beakers.values():
|
||||
if isinstance(beaker, SqliteBeaker):
|
||||
if bl := len(beaker):
|
||||
beaker.reset()
|
||||
typer.secho(f"{beaker.name} reset ({bl})", fg=typer.colors.RED)
|
||||
else:
|
||||
typer.secho(f"{beaker.name} empty", fg=typer.colors.GREEN)
|
||||
|
||||
|
||||
@app.command()
|
||||
def show(ctx: typer.Context) -> None:
|
||||
ctx.obj.show()
|
||||
|
||||
|
||||
@app.command()
|
||||
def graph(ctx: typer.Context) -> None:
|
||||
pprint(ctx.obj.graph_data())
|
||||
|
||||
|
||||
@app.command()
|
||||
def run(
|
||||
ctx: typer.Context,
|
||||
input: Annotated[Optional[List[str]], typer.Option(...)] = None,
|
||||
start: Optional[str] = typer.Option(None),
|
||||
end: Optional[str] = typer.Option(None),
|
||||
) -> None:
|
||||
if ctx.obj.seeds:
|
||||
typer.secho("Seeding beakers", fg=typer.colors.GREEN)
|
||||
ctx.obj.process_seeds()
|
||||
has_data = any(ctx.obj.beakers.values())
|
||||
if not input and not has_data:
|
||||
typer.secho("No data; pass --input to seed beaker(s)", fg=typer.colors.RED)
|
||||
raise typer.Exit(1)
|
||||
for input_str in input or []:
|
||||
beaker, filename = input_str.split("=")
|
||||
ctx.obj.csv_to_beaker(filename, beaker)
|
||||
ctx.obj.run_once(start, end)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
@ -1,41 +0,0 @@
|
||||
import httpx
|
||||
from pydantic import BaseModel, Field
|
||||
import datetime
|
||||
|
||||
|
||||
class HttpResponse(BaseModel):
|
||||
"""
|
||||
Beaker data type that represents an HTTP response.
|
||||
"""
|
||||
|
||||
url: str
|
||||
status_code: int
|
||||
response_body: str
|
||||
retrieved_at: datetime.datetime = Field(default_factory=datetime.datetime.now)
|
||||
|
||||
|
||||
class HttpRequest:
|
||||
"""
|
||||
Filter that converts from a beaker with a URL to a beaker with an HTTP response.
|
||||
"""
|
||||
|
||||
def __init__(self, beaker: str, field: str):
|
||||
"""
|
||||
Args:
|
||||
beaker: The name of the beaker that contains the URL.
|
||||
field: The name of the field in the beaker that contains the URL.
|
||||
"""
|
||||
self.beaker = beaker
|
||||
self.field = field
|
||||
|
||||
async def __call__(self, item: BaseModel) -> HttpResponse:
|
||||
url = getattr(item, self.field)
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.get(url)
|
||||
|
||||
return HttpResponse(
|
||||
url=url,
|
||||
status_code=response.status_code,
|
||||
response_body=response.text,
|
||||
)
|
@ -1,319 +0,0 @@
|
||||
import csv
|
||||
import json
|
||||
import typer
|
||||
import inspect
|
||||
import sqlite3
|
||||
import hashlib
|
||||
import asyncio
|
||||
import networkx # type: ignore
|
||||
from collections import defaultdict, Counter
|
||||
from typing import Iterable, Callable, Type
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from structlog import get_logger
|
||||
|
||||
from .beakers import Beaker, SqliteBeaker, TempBeaker
|
||||
|
||||
log = get_logger()
|
||||
|
||||
|
||||
def get_sha512(filename: str) -> str:
|
||||
with open(filename, "rb") as file:
|
||||
return hashlib.sha512(file.read()).hexdigest()
|
||||
|
||||
|
||||
class Transform(BaseModel):
|
||||
model_config = ConfigDict(frozen=True)
|
||||
|
||||
name: str
|
||||
transform_func: Callable
|
||||
error_map: dict[tuple, str]
|
||||
|
||||
|
||||
class ErrorType(BaseModel):
|
||||
item: BaseModel
|
||||
exception: str
|
||||
exc_type: str
|
||||
|
||||
|
||||
def if_cond_true(data_cond_tup: tuple[dict, bool]) -> dict | None:
|
||||
return data_cond_tup[0] if data_cond_tup[1] else None
|
||||
|
||||
|
||||
def if_cond_false(data_cond_tup: tuple[dict, bool]) -> dict | None:
|
||||
return data_cond_tup[0] if not data_cond_tup[1] else None
|
||||
|
||||
|
||||
class Recipe:
|
||||
def __init__(self, name: str, db_name: str = "beakers.db"):
|
||||
self.name = name
|
||||
self.graph = networkx.DiGraph()
|
||||
self.beakers: dict[str, Beaker] = {}
|
||||
self.seeds: defaultdict[str, list[Iterable[BaseModel]]] = defaultdict(list)
|
||||
self.db = sqlite3.connect(db_name)
|
||||
cursor = self.db.cursor()
|
||||
cursor.execute(
|
||||
"CREATE TABLE IF NOT EXISTS _metadata (table_name TEXT PRIMARY KEY, data JSON)"
|
||||
)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"Recipe({self.name})"
|
||||
|
||||
def add_beaker(
|
||||
self,
|
||||
name: str,
|
||||
datatype: Type[BaseModel],
|
||||
beaker_type: Type[Beaker] = SqliteBeaker,
|
||||
) -> Beaker:
|
||||
self.graph.add_node(name, datatype=datatype)
|
||||
if datatype is None:
|
||||
self.beakers[name] = TempBeaker(name, datatype, self)
|
||||
else:
|
||||
self.beakers[name] = SqliteBeaker(name, datatype, self)
|
||||
return self.beakers[name]
|
||||
|
||||
def add_transform(
|
||||
self,
|
||||
from_beaker: str,
|
||||
to_beaker: str,
|
||||
transform_func: Callable,
|
||||
*,
|
||||
name: str | None = None,
|
||||
error_map: dict[tuple, str] | None = None,
|
||||
) -> None:
|
||||
if name is None:
|
||||
name = transform_func.__name__
|
||||
if name == "<lambda>":
|
||||
name = "λ"
|
||||
transform = Transform(
|
||||
name=name,
|
||||
transform_func=transform_func,
|
||||
error_map=error_map or {},
|
||||
)
|
||||
self.graph.add_edge(
|
||||
from_beaker,
|
||||
to_beaker,
|
||||
transform=transform,
|
||||
)
|
||||
|
||||
def add_conditional(
|
||||
self,
|
||||
from_beaker: str,
|
||||
condition_func: Callable,
|
||||
if_true: str,
|
||||
if_false: str = "",
|
||||
) -> None:
|
||||
# first add a transform to evaluate the conditional
|
||||
if condition_func.__name__ == "<lambda>":
|
||||
cond_name = f"cond-{from_beaker}"
|
||||
else:
|
||||
cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
|
||||
self.add_beaker(cond_name, None)
|
||||
self.add_transform(
|
||||
from_beaker,
|
||||
cond_name,
|
||||
lambda data: (data, condition_func(data)),
|
||||
name=cond_name,
|
||||
)
|
||||
|
||||
# then add two filtered paths that remove the condition result
|
||||
self.add_beaker(if_true, None)
|
||||
self.add_transform(
|
||||
cond_name,
|
||||
if_true,
|
||||
if_cond_true,
|
||||
)
|
||||
if if_false:
|
||||
self.add_transform(
|
||||
cond_name,
|
||||
if_false,
|
||||
if_cond_false,
|
||||
)
|
||||
|
||||
def add_seed(self, beaker_name: str, data: Iterable[BaseModel]) -> None:
|
||||
self.seeds[beaker_name].append(data)
|
||||
|
||||
def process_seeds(self) -> None:
|
||||
log.info("process_seeds", recipe=self.name)
|
||||
for beaker_name, seeds in self.seeds.items():
|
||||
for seed in seeds:
|
||||
self.beakers[beaker_name].add_items(seed)
|
||||
|
||||
def get_metadata(self, table_name: str) -> dict:
|
||||
cursor = self.db.cursor()
|
||||
cursor.execute(
|
||||
"SELECT data FROM _metadata WHERE table_name = ?",
|
||||
(table_name,),
|
||||
)
|
||||
try:
|
||||
data = cursor.fetchone()["data"]
|
||||
log.debug("get_metadata", table_name=table_name, data=data)
|
||||
return json.loads(data)
|
||||
except TypeError:
|
||||
log.debug("get_metadata", table_name=table_name, data={})
|
||||
return {}
|
||||
|
||||
def save_metadata(self, table_name: str, data: dict) -> None:
|
||||
data_json = json.dumps(data)
|
||||
log.info("save_metadata", table_name=table_name, data=data_json)
|
||||
# sqlite upsert
|
||||
cursor = self.db.cursor()
|
||||
cursor.execute(
|
||||
"INSERT INTO _metadata (table_name, data) VALUES (?, ?) ON CONFLICT(table_name) DO UPDATE SET data = ?",
|
||||
(table_name, data_json, data_json),
|
||||
)
|
||||
self.db.commit()
|
||||
|
||||
def csv_to_beaker(self, filename: str, beaker_name: str) -> None:
|
||||
beaker = self.beakers[beaker_name]
|
||||
lg = log.bind(beaker=beaker, filename=filename)
|
||||
# three cases: empty, match, mismatch
|
||||
# case 1: empty
|
||||
if len(beaker) == 0:
|
||||
with open(filename, "r") as file:
|
||||
reader = csv.DictReader(file)
|
||||
added = 0
|
||||
for row in reader:
|
||||
beaker.add_item(beaker.model(**row))
|
||||
added += 1
|
||||
lg.info("from_csv", case="empty", added=added)
|
||||
meta = self.get_metadata(beaker.name)
|
||||
meta["sha512"] = get_sha512(filename)
|
||||
self.save_metadata(beaker.name, meta)
|
||||
else:
|
||||
old_sha = self.get_metadata(beaker.name).get("sha512")
|
||||
new_sha = get_sha512(filename)
|
||||
if old_sha != new_sha:
|
||||
# case 3: mismatch
|
||||
lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha)
|
||||
raise Exception("sha512 mismatch")
|
||||
else:
|
||||
# case 2: match
|
||||
lg.info("from_csv", case="match")
|
||||
|
||||
def show(self) -> None:
|
||||
seed_count = Counter(self.seeds.keys())
|
||||
typer.secho("Seeds", fg=typer.colors.GREEN)
|
||||
for beaker, count in seed_count.items():
|
||||
typer.secho(f" {beaker} ({count})", fg=typer.colors.GREEN)
|
||||
graph_data = self.graph_data()
|
||||
for node in graph_data:
|
||||
if node["temp"]:
|
||||
typer.secho(node["name"], fg=typer.colors.CYAN)
|
||||
else:
|
||||
typer.secho(
|
||||
f"{node['name']} ({node['len']})",
|
||||
fg=typer.colors.GREEN if node["len"] else typer.colors.YELLOW,
|
||||
)
|
||||
for edge in node["edges"]:
|
||||
print(f" -({edge['transform'].name})-> {edge['to_beaker']}")
|
||||
for k, v in edge["transform"].error_map.items():
|
||||
if isinstance(k, tuple):
|
||||
typer.secho(
|
||||
f" {' '.join(c.__name__ for c in k)} -> {v}",
|
||||
fg=typer.colors.RED,
|
||||
)
|
||||
else:
|
||||
typer.secho(f" {k.__name__} -> {v}", fg=typer.colors.RED)
|
||||
|
||||
def graph_data(self) -> list[dict]:
|
||||
nodes = {}
|
||||
|
||||
for node in networkx.topological_sort(self.graph):
|
||||
beaker = self.beakers[node]
|
||||
temp = isinstance(beaker, TempBeaker)
|
||||
|
||||
nodes[node] = {
|
||||
"name": node,
|
||||
"temp": temp,
|
||||
"len": len(beaker),
|
||||
"edges": [],
|
||||
}
|
||||
|
||||
rank = 0
|
||||
for from_b, to_b, edge in self.graph.in_edges(node, data=True):
|
||||
if nodes[from_b]["rank"] > rank:
|
||||
rank = nodes[from_b]["rank"]
|
||||
nodes[node]["rank"] = rank + 1
|
||||
|
||||
for from_b, to_b, edge in self.graph.out_edges(node, data=True):
|
||||
edge["to_beaker"] = to_b
|
||||
nodes[node]["edges"].append(edge)
|
||||
|
||||
# all data collected for display
|
||||
return sorted(nodes.values(), key=lambda x: (x["rank"], x["name"]))
|
||||
|
||||
def run_once(
|
||||
self, start_beaker: str | None = None, end_beaker: str | None = None
|
||||
) -> None:
|
||||
log.info("run_once", recipe=self)
|
||||
loop = asyncio.get_event_loop()
|
||||
|
||||
started = False if start_beaker else True
|
||||
|
||||
# go through each node in forward order, pushing data
|
||||
for node in networkx.topological_sort(self.graph):
|
||||
# only process nodes between start and end
|
||||
if not started:
|
||||
if node == start_beaker:
|
||||
started = True
|
||||
log.info("partial run start", node=node)
|
||||
else:
|
||||
log.info("partial run skip", node=node, waiting_for=start_beaker)
|
||||
continue
|
||||
if end_beaker and node == end_beaker:
|
||||
log.info("partial run end", node=node)
|
||||
break
|
||||
|
||||
# get outbound edges
|
||||
edges = self.graph.out_edges(node, data=True)
|
||||
|
||||
for from_b, to_b, edge in edges:
|
||||
transform = edge["transform"]
|
||||
|
||||
from_beaker = self.beakers[from_b]
|
||||
to_beaker = self.beakers[to_b]
|
||||
already_processed = from_beaker.id_set() & to_beaker.id_set()
|
||||
|
||||
log.info(
|
||||
"transform",
|
||||
from_b=from_b,
|
||||
to_b=to_b,
|
||||
to_process=len(from_beaker) - len(already_processed),
|
||||
already_processed=len(already_processed),
|
||||
transform=edge["transform"].name,
|
||||
)
|
||||
|
||||
# convert coroutine to function
|
||||
if inspect.iscoroutinefunction(transform.transform_func):
|
||||
t_func = lambda x: loop.run_until_complete(
|
||||
transform.transform_func(x)
|
||||
)
|
||||
else:
|
||||
t_func = transform.transform_func
|
||||
|
||||
for id, item in from_beaker.items():
|
||||
if id in already_processed:
|
||||
continue
|
||||
try:
|
||||
transformed = t_func(item)
|
||||
if transformed:
|
||||
to_beaker.add_item(transformed, id)
|
||||
except Exception as e:
|
||||
for (
|
||||
error_types,
|
||||
error_beaker_name,
|
||||
) in transform.error_map.items():
|
||||
if isinstance(e, error_types):
|
||||
error_beaker = self.beakers[error_beaker_name]
|
||||
error_beaker.add_item(
|
||||
ErrorType(
|
||||
item=item,
|
||||
exception=str(e),
|
||||
exc_type=str(type(e)),
|
||||
),
|
||||
id,
|
||||
)
|
||||
break
|
||||
else:
|
||||
# no error handler, re-raise
|
||||
raise
|
@ -1,23 +0,0 @@
|
||||
import uuid
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class Record:
|
||||
_reserved_names = ("id",)
|
||||
|
||||
def __init__(self, id: str | None = None):
|
||||
self._id = id if id else str(uuid.uuid1())
|
||||
self._data: dict[str, BaseModel] = {}
|
||||
|
||||
def __getattr__(self, name: str) -> str | BaseModel:
|
||||
if name == "id":
|
||||
return self._id
|
||||
return self._data[name]
|
||||
|
||||
def __setattr__(self, name: str, value: BaseModel) -> None:
|
||||
if name.startswith("_"):
|
||||
super().__setattr__(name, value)
|
||||
elif name not in self._data and name not in self._reserved_names:
|
||||
self._data[name] = value
|
||||
else:
|
||||
raise AttributeError(f"DataObject attribute {name} already exists")
|
@ -1,34 +0,0 @@
|
||||
from beakers.record import Record
|
||||
import pytest
|
||||
|
||||
|
||||
def test_record_id_autogen():
|
||||
r = Record()
|
||||
assert len(r.id) == 36
|
||||
r2, r3 = Record(), Record()
|
||||
assert r2.id != r3.id
|
||||
|
||||
|
||||
def test_record_id_assign():
|
||||
r = Record(id="test")
|
||||
assert r.id == "test"
|
||||
|
||||
|
||||
def test_record_setattr_good():
|
||||
r = Record()
|
||||
r.attrib = "set"
|
||||
assert r.attrib == "set"
|
||||
|
||||
|
||||
def test_record_setattr_duplicate():
|
||||
r = Record()
|
||||
r.attrib = "set"
|
||||
with pytest.raises(AttributeError):
|
||||
r.attrib = "changed"
|
||||
assert r.attrib == "set"
|
||||
|
||||
|
||||
def test_record_setattr_id():
|
||||
r = Record()
|
||||
with pytest.raises(AttributeError):
|
||||
r.id = "changed"
|
Loading…
Reference in New Issue
Block a user