databeakers
This commit is contained in:
parent
c21954ebba
commit
b357a6d1d5
@ -1,61 +0,0 @@
|
|||||||
from beakers.recipe import Recipe
|
|
||||||
import pydantic
|
|
||||||
|
|
||||||
|
|
||||||
class Word(pydantic.BaseModel):
|
|
||||||
word: str
|
|
||||||
|
|
||||||
|
|
||||||
class ClassifiedWord(pydantic.BaseModel):
|
|
||||||
normalized_word: str
|
|
||||||
is_fruit: bool
|
|
||||||
|
|
||||||
|
|
||||||
class Sentence(pydantic.BaseModel):
|
|
||||||
sentence: list[str]
|
|
||||||
|
|
||||||
|
|
||||||
def word_classifier(item) -> ClassifiedWord:
|
|
||||||
return ClassifiedWord(
|
|
||||||
normalized_word=item.word.lower(),
|
|
||||||
is_fruit=item.word.lower()
|
|
||||||
in (
|
|
||||||
"apple",
|
|
||||||
"banana",
|
|
||||||
"fig",
|
|
||||||
"grape",
|
|
||||||
"lemon",
|
|
||||||
"mango",
|
|
||||||
"orange",
|
|
||||||
"pear",
|
|
||||||
"raspberry",
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
recipe = Recipe("fruits-example")
|
|
||||||
recipe.add_beaker("word", Word)
|
|
||||||
recipe.add_beaker("classified_word", ClassifiedWord)
|
|
||||||
recipe.add_beaker("sentence", Sentence)
|
|
||||||
recipe.add_transform("word", "classified_word", word_classifier)
|
|
||||||
recipe.add_conditional(
|
|
||||||
"classified_word",
|
|
||||||
lambda cw: cw.is_fruit,
|
|
||||||
"fruits",
|
|
||||||
)
|
|
||||||
recipe.add_transform(
|
|
||||||
"fruits",
|
|
||||||
"sentence",
|
|
||||||
lambda x: Sentence(sentence=f"I love a fresh {x.normalized_word}".split()),
|
|
||||||
)
|
|
||||||
|
|
||||||
recipe.add_seed(
|
|
||||||
"word",
|
|
||||||
[
|
|
||||||
Word(word="apple"),
|
|
||||||
Word(word="bAnAnA"),
|
|
||||||
Word(word="hammer"),
|
|
||||||
Word(word="orange"),
|
|
||||||
Word(word="EGG"),
|
|
||||||
],
|
|
||||||
)
|
|
245
poetry.lock
generated
245
poetry.lock
generated
@ -1,10 +1,9 @@
|
|||||||
# This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "annotated-types"
|
name = "annotated-types"
|
||||||
version = "0.5.0"
|
version = "0.5.0"
|
||||||
description = "Reusable constraint types to use with typing.Annotated"
|
description = "Reusable constraint types to use with typing.Annotated"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -16,7 +15,6 @@ files = [
|
|||||||
name = "anyio"
|
name = "anyio"
|
||||||
version = "3.7.1"
|
version = "3.7.1"
|
||||||
description = "High level compatibility layer for multiple asynchronous event loop implementations"
|
description = "High level compatibility layer for multiple asynchronous event loop implementations"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -37,7 +35,6 @@ trio = ["trio (<0.22)"]
|
|||||||
name = "certifi"
|
name = "certifi"
|
||||||
version = "2023.5.7"
|
version = "2023.5.7"
|
||||||
description = "Python package for providing Mozilla's CA Bundle."
|
description = "Python package for providing Mozilla's CA Bundle."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.6"
|
python-versions = ">=3.6"
|
||||||
files = [
|
files = [
|
||||||
@ -49,7 +46,6 @@ files = [
|
|||||||
name = "charset-normalizer"
|
name = "charset-normalizer"
|
||||||
version = "3.2.0"
|
version = "3.2.0"
|
||||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7.0"
|
python-versions = ">=3.7.0"
|
||||||
files = [
|
files = [
|
||||||
@ -130,11 +126,24 @@ files = [
|
|||||||
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "click"
|
||||||
|
version = "8.1.6"
|
||||||
|
description = "Composable command line interface toolkit"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"},
|
||||||
|
{file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = {version = "*", markers = "platform_system == \"Windows\""}
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "colorama"
|
name = "colorama"
|
||||||
version = "0.4.6"
|
version = "0.4.6"
|
||||||
description = "Cross-platform colored terminal text."
|
description = "Cross-platform colored terminal text."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
|
||||||
files = [
|
files = [
|
||||||
@ -142,11 +151,32 @@ files = [
|
|||||||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "databeakers"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = ""
|
||||||
|
optional = false
|
||||||
|
python-versions = "^3.10"
|
||||||
|
files = []
|
||||||
|
develop = true
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
httpx = "^0.24.0"
|
||||||
|
networkx = "^3.1"
|
||||||
|
pydantic = "^2.0.2"
|
||||||
|
rich = "^13.4.2"
|
||||||
|
scrapelib = "^2.1.0"
|
||||||
|
structlog = "^23.1.0"
|
||||||
|
typer = "^0.9.0"
|
||||||
|
|
||||||
|
[package.source]
|
||||||
|
type = "directory"
|
||||||
|
url = "../beakers"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "h11"
|
name = "h11"
|
||||||
version = "0.14.0"
|
version = "0.14.0"
|
||||||
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
|
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -158,7 +188,6 @@ files = [
|
|||||||
name = "httpcore"
|
name = "httpcore"
|
||||||
version = "0.17.3"
|
version = "0.17.3"
|
||||||
description = "A minimal low-level HTTP client."
|
description = "A minimal low-level HTTP client."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -170,17 +199,16 @@ files = [
|
|||||||
anyio = ">=3.0,<5.0"
|
anyio = ">=3.0,<5.0"
|
||||||
certifi = "*"
|
certifi = "*"
|
||||||
h11 = ">=0.13,<0.15"
|
h11 = ">=0.13,<0.15"
|
||||||
sniffio = ">=1.0.0,<2.0.0"
|
sniffio = "==1.*"
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
http2 = ["h2 (>=3,<5)"]
|
http2 = ["h2 (>=3,<5)"]
|
||||||
socks = ["socksio (>=1.0.0,<2.0.0)"]
|
socks = ["socksio (==1.*)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "httpx"
|
name = "httpx"
|
||||||
version = "0.24.1"
|
version = "0.24.1"
|
||||||
description = "The next generation HTTP client."
|
description = "The next generation HTTP client."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -196,15 +224,14 @@ sniffio = "*"
|
|||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
brotli = ["brotli", "brotlicffi"]
|
brotli = ["brotli", "brotlicffi"]
|
||||||
cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"]
|
cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
|
||||||
http2 = ["h2 (>=3,<5)"]
|
http2 = ["h2 (>=3,<5)"]
|
||||||
socks = ["socksio (>=1.0.0,<2.0.0)"]
|
socks = ["socksio (==1.*)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "3.4"
|
version = "3.4"
|
||||||
description = "Internationalized Domain Names in Applications (IDNA)"
|
description = "Internationalized Domain Names in Applications (IDNA)"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.5"
|
python-versions = ">=3.5"
|
||||||
files = [
|
files = [
|
||||||
@ -213,80 +240,44 @@ files = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "iniconfig"
|
name = "markdown-it-py"
|
||||||
version = "2.0.0"
|
version = "3.0.0"
|
||||||
description = "brain-dead simple config-ini parsing"
|
description = "Python port of markdown-it. Markdown parsing, done right!"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
{file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"},
|
{file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"},
|
||||||
{file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"},
|
{file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"},
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "mypy"
|
|
||||||
version = "1.4.1"
|
|
||||||
description = "Optional static typing for Python"
|
|
||||||
category = "dev"
|
|
||||||
optional = false
|
|
||||||
python-versions = ">=3.7"
|
|
||||||
files = [
|
|
||||||
{file = "mypy-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:566e72b0cd6598503e48ea610e0052d1b8168e60a46e0bfd34b3acf2d57f96a8"},
|
|
||||||
{file = "mypy-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca637024ca67ab24a7fd6f65d280572c3794665eaf5edcc7e90a866544076878"},
|
|
||||||
{file = "mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dde1d180cd84f0624c5dcaaa89c89775550a675aff96b5848de78fb11adabcd"},
|
|
||||||
{file = "mypy-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c4d8e89aa7de683e2056a581ce63c46a0c41e31bd2b6d34144e2c80f5ea53dc"},
|
|
||||||
{file = "mypy-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:bfdca17c36ae01a21274a3c387a63aa1aafe72bff976522886869ef131b937f1"},
|
|
||||||
{file = "mypy-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7549fbf655e5825d787bbc9ecf6028731973f78088fbca3a1f4145c39ef09462"},
|
|
||||||
{file = "mypy-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98324ec3ecf12296e6422939e54763faedbfcc502ea4a4c38502082711867258"},
|
|
||||||
{file = "mypy-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141dedfdbfe8a04142881ff30ce6e6653c9685b354876b12e4fe6c78598b45e2"},
|
|
||||||
{file = "mypy-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8207b7105829eca6f3d774f64a904190bb2231de91b8b186d21ffd98005f14a7"},
|
|
||||||
{file = "mypy-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:16f0db5b641ba159eff72cff08edc3875f2b62b2fa2bc24f68c1e7a4e8232d01"},
|
|
||||||
{file = "mypy-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:470c969bb3f9a9efcedbadcd19a74ffb34a25f8e6b0e02dae7c0e71f8372f97b"},
|
|
||||||
{file = "mypy-1.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5952d2d18b79f7dc25e62e014fe5a23eb1a3d2bc66318df8988a01b1a037c5b"},
|
|
||||||
{file = "mypy-1.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:190b6bab0302cec4e9e6767d3eb66085aef2a1cc98fe04936d8a42ed2ba77bb7"},
|
|
||||||
{file = "mypy-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9d40652cc4fe33871ad3338581dca3297ff5f2213d0df345bcfbde5162abf0c9"},
|
|
||||||
{file = "mypy-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01fd2e9f85622d981fd9063bfaef1aed6e336eaacca00892cd2d82801ab7c042"},
|
|
||||||
{file = "mypy-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2460a58faeea905aeb1b9b36f5065f2dc9a9c6e4c992a6499a2360c6c74ceca3"},
|
|
||||||
{file = "mypy-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2746d69a8196698146a3dbe29104f9eb6a2a4d8a27878d92169a6c0b74435b6"},
|
|
||||||
{file = "mypy-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ae704dcfaa180ff7c4cfbad23e74321a2b774f92ca77fd94ce1049175a21c97f"},
|
|
||||||
{file = "mypy-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:43d24f6437925ce50139a310a64b2ab048cb2d3694c84c71c3f2a1626d8101dc"},
|
|
||||||
{file = "mypy-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c482e1246726616088532b5e964e39765b6d1520791348e6c9dc3af25b233828"},
|
|
||||||
{file = "mypy-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43b592511672017f5b1a483527fd2684347fdffc041c9ef53428c8dc530f79a3"},
|
|
||||||
{file = "mypy-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34a9239d5b3502c17f07fd7c0b2ae6b7dd7d7f6af35fbb5072c6208e76295816"},
|
|
||||||
{file = "mypy-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5703097c4936bbb9e9bce41478c8d08edd2865e177dc4c52be759f81ee4dd26c"},
|
|
||||||
{file = "mypy-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e02d700ec8d9b1859790c0475df4e4092c7bf3272a4fd2c9f33d87fac4427b8f"},
|
|
||||||
{file = "mypy-1.4.1-py3-none-any.whl", hash = "sha256:45d32cec14e7b97af848bddd97d85ea4f0db4d5a149ed9676caa4eb2f7402bb4"},
|
|
||||||
{file = "mypy-1.4.1.tar.gz", hash = "sha256:9bbcd9ab8ea1f2e1c8031c21445b511442cc45c89951e49bbf852cbb70755b1b"},
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
mypy-extensions = ">=1.0.0"
|
mdurl = ">=0.1,<1.0"
|
||||||
typing-extensions = ">=4.1.0"
|
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
dmypy = ["psutil (>=4.0)"]
|
benchmarking = ["psutil", "pytest", "pytest-benchmark"]
|
||||||
install-types = ["pip"]
|
code-style = ["pre-commit (>=3.0,<4.0)"]
|
||||||
python2 = ["typed-ast (>=1.4.0,<2)"]
|
compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"]
|
||||||
reports = ["lxml"]
|
linkify = ["linkify-it-py (>=1,<3)"]
|
||||||
|
plugins = ["mdit-py-plugins"]
|
||||||
|
profiling = ["gprof2dot"]
|
||||||
|
rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"]
|
||||||
|
testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mypy-extensions"
|
name = "mdurl"
|
||||||
version = "1.0.0"
|
version = "0.1.2"
|
||||||
description = "Type system extensions for programs checked with the mypy type checker."
|
description = "Markdown URL utilities"
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.5"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
|
{file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"},
|
||||||
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
|
{file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "networkx"
|
name = "networkx"
|
||||||
version = "3.1"
|
version = "3.1"
|
||||||
description = "Python package for creating and manipulating graphs and networks"
|
description = "Python package for creating and manipulating graphs and networks"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.8"
|
python-versions = ">=3.8"
|
||||||
files = [
|
files = [
|
||||||
@ -301,39 +292,10 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-
|
|||||||
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
|
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"]
|
||||||
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "packaging"
|
|
||||||
version = "23.1"
|
|
||||||
description = "Core utilities for Python packages"
|
|
||||||
category = "dev"
|
|
||||||
optional = false
|
|
||||||
python-versions = ">=3.7"
|
|
||||||
files = [
|
|
||||||
{file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"},
|
|
||||||
{file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "pluggy"
|
|
||||||
version = "1.2.0"
|
|
||||||
description = "plugin and hook calling mechanisms for python"
|
|
||||||
category = "dev"
|
|
||||||
optional = false
|
|
||||||
python-versions = ">=3.7"
|
|
||||||
files = [
|
|
||||||
{file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"},
|
|
||||||
{file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"},
|
|
||||||
]
|
|
||||||
|
|
||||||
[package.extras]
|
|
||||||
dev = ["pre-commit", "tox"]
|
|
||||||
testing = ["pytest", "pytest-benchmark"]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pydantic"
|
name = "pydantic"
|
||||||
version = "2.0.2"
|
version = "2.0.2"
|
||||||
description = "Data validation using Python type hints"
|
description = "Data validation using Python type hints"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -353,7 +315,6 @@ email = ["email-validator (>=2.0.0)"]
|
|||||||
name = "pydantic-core"
|
name = "pydantic-core"
|
||||||
version = "2.1.2"
|
version = "2.1.2"
|
||||||
description = ""
|
description = ""
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -464,31 +425,23 @@ files = [
|
|||||||
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
|
typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pytest"
|
name = "pygments"
|
||||||
version = "7.4.0"
|
version = "2.15.1"
|
||||||
description = "pytest: simple powerful testing with Python"
|
description = "Pygments is a syntax highlighting package written in Python."
|
||||||
category = "dev"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
{file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"},
|
{file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"},
|
||||||
{file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"},
|
{file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
|
||||||
colorama = {version = "*", markers = "sys_platform == \"win32\""}
|
|
||||||
iniconfig = "*"
|
|
||||||
packaging = "*"
|
|
||||||
pluggy = ">=0.12,<2.0"
|
|
||||||
|
|
||||||
[package.extras]
|
[package.extras]
|
||||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
plugins = ["importlib-metadata"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "requests"
|
name = "requests"
|
||||||
version = "2.31.0"
|
version = "2.31.0"
|
||||||
description = "Python HTTP for Humans."
|
description = "Python HTTP for Humans."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -506,11 +459,28 @@ urllib3 = ">=1.21.1,<3"
|
|||||||
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
|
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
|
||||||
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rich"
|
||||||
|
version = "13.5.2"
|
||||||
|
description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7.0"
|
||||||
|
files = [
|
||||||
|
{file = "rich-13.5.2-py3-none-any.whl", hash = "sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808"},
|
||||||
|
{file = "rich-13.5.2.tar.gz", hash = "sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
markdown-it-py = ">=2.2.0"
|
||||||
|
pygments = ">=2.13.0,<3.0.0"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
jupyter = ["ipywidgets (>=7.5.1,<9)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "scrapelib"
|
name = "scrapelib"
|
||||||
version = "2.2.0"
|
version = "2.2.0"
|
||||||
description = ""
|
description = ""
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7,<4.0"
|
python-versions = ">=3.7,<4.0"
|
||||||
files = [
|
files = [
|
||||||
@ -526,7 +496,6 @@ urllib3 = ">=1.26,<2.0"
|
|||||||
name = "sniffio"
|
name = "sniffio"
|
||||||
version = "1.3.0"
|
version = "1.3.0"
|
||||||
description = "Sniff out which async library your code is running under"
|
description = "Sniff out which async library your code is running under"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -534,11 +503,48 @@ files = [
|
|||||||
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
|
{file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "structlog"
|
||||||
|
version = "23.1.0"
|
||||||
|
description = "Structured Logging for Python"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.7"
|
||||||
|
files = [
|
||||||
|
{file = "structlog-23.1.0-py3-none-any.whl", hash = "sha256:79b9e68e48b54e373441e130fa447944e6f87a05b35de23138e475c05d0f7e0e"},
|
||||||
|
{file = "structlog-23.1.0.tar.gz", hash = "sha256:270d681dd7d163c11ba500bc914b2472d2b50a8ef00faa999ded5ff83a2f906b"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
dev = ["structlog[docs,tests,typing]"]
|
||||||
|
docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-mermaid", "twisted"]
|
||||||
|
tests = ["coverage[toml]", "freezegun (>=0.2.8)", "pretend", "pytest (>=6.0)", "pytest-asyncio (>=0.17)", "simplejson"]
|
||||||
|
typing = ["mypy", "rich", "twisted"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "typer"
|
||||||
|
version = "0.9.0"
|
||||||
|
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"},
|
||||||
|
{file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
click = ">=7.1.1,<9.0.0"
|
||||||
|
typing-extensions = ">=3.7.4.3"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||||
|
dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"]
|
||||||
|
doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"]
|
||||||
|
test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typing-extensions"
|
name = "typing-extensions"
|
||||||
version = "4.7.1"
|
version = "4.7.1"
|
||||||
description = "Backported and Experimental Type Hints for Python 3.7+"
|
description = "Backported and Experimental Type Hints for Python 3.7+"
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.7"
|
python-versions = ">=3.7"
|
||||||
files = [
|
files = [
|
||||||
@ -550,7 +556,6 @@ files = [
|
|||||||
name = "urllib3"
|
name = "urllib3"
|
||||||
version = "1.26.16"
|
version = "1.26.16"
|
||||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||||
category = "main"
|
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||||
files = [
|
files = [
|
||||||
@ -566,4 +571,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11"
|
python-versions = "^3.11"
|
||||||
content-hash = "136157484750ab7b80e7896a5dc2905b4f9c7ed20b47dadba4e6ae45f0f6c289"
|
content-hash = "a121f92ed7a60a9138f1e19bbf88220cd0a5b25c0287f031e36cec0a79188530"
|
||||||
|
@ -1,27 +1,15 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "beakers"
|
name = "foiaghost"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
description = ""
|
description = ""
|
||||||
authors = ["James Turk <dev@jamesturk.net>"]
|
authors = ["James Turk <dev@jamesturk.net>"]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
|
||||||
bkr = 'beakers.cli:app'
|
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.11"
|
python = "^3.11"
|
||||||
#scrapeghost = {path = "../scrapeghost", develop = true}
|
databeakers = {path = "../beakers", develop = true}
|
||||||
scrapelib = "^2.1.0"
|
|
||||||
httpx = "^0.24.0"
|
|
||||||
networkx = "^3.1"
|
|
||||||
pydantic = "^2.0.2"
|
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
|
||||||
pytest = "^7.4.0"
|
|
||||||
mypy = "^1.4.1"
|
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
@ -1 +0,0 @@
|
|||||||
from .recipe import Recipe
|
|
@ -1,99 +0,0 @@
|
|||||||
import abc
|
|
||||||
import json
|
|
||||||
import sqlite3
|
|
||||||
import uuid
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from typing import Iterable, Type, TYPE_CHECKING
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from .recipe import Recipe
|
|
||||||
|
|
||||||
PydanticModel = Type[BaseModel]
|
|
||||||
|
|
||||||
|
|
||||||
class Beaker(abc.ABC):
|
|
||||||
def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
|
|
||||||
self.name = name
|
|
||||||
self.model = model
|
|
||||||
self.recipe = recipe
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return f"Beaker({self.name}, {self.model.__name__})"
|
|
||||||
|
|
||||||
@abc.abstractmethod
|
|
||||||
def items(self) -> Iterable[tuple[str, BaseModel]]:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abc.abstractmethod
|
|
||||||
def __len__(self) -> int:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abc.abstractmethod
|
|
||||||
def add_item(self, item: BaseModel, id: str | None = None) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
@abc.abstractmethod
|
|
||||||
def reset(self) -> None:
|
|
||||||
pass
|
|
||||||
|
|
||||||
def add_items(self, items: Iterable[BaseModel]) -> None:
|
|
||||||
for item in items:
|
|
||||||
self.add_item(item)
|
|
||||||
|
|
||||||
def id_set(self) -> set[str]:
|
|
||||||
return set(id for id, _ in self.items())
|
|
||||||
|
|
||||||
|
|
||||||
class TempBeaker(Beaker):
|
|
||||||
def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
|
|
||||||
super().__init__(name, model, recipe)
|
|
||||||
self._items: list[tuple[str, BaseModel]] = []
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
return len(self._items)
|
|
||||||
|
|
||||||
def add_item(self, item: BaseModel, id: str | None = None) -> None:
|
|
||||||
if id is None:
|
|
||||||
id = str(uuid.uuid1())
|
|
||||||
self._items.append((id, item))
|
|
||||||
|
|
||||||
def items(self) -> Iterable[tuple[str, BaseModel]]:
|
|
||||||
yield from self._items
|
|
||||||
|
|
||||||
def reset(self) -> None:
|
|
||||||
self._items = []
|
|
||||||
|
|
||||||
|
|
||||||
class SqliteBeaker(Beaker):
|
|
||||||
def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"):
|
|
||||||
super().__init__(name, model, recipe)
|
|
||||||
# create table if it doesn't exist
|
|
||||||
self.cursor = self.recipe.db.cursor()
|
|
||||||
self.cursor.row_factory = sqlite3.Row # type: ignore
|
|
||||||
self.cursor.execute(
|
|
||||||
f"CREATE TABLE IF NOT EXISTS {self.name} (uuid TEXT PRIMARY KEY, data JSON)"
|
|
||||||
)
|
|
||||||
|
|
||||||
def items(self) -> Iterable[tuple[str, BaseModel]]:
|
|
||||||
self.cursor.execute(f"SELECT uuid, data FROM {self.name}")
|
|
||||||
data = self.cursor.fetchall()
|
|
||||||
for item in data:
|
|
||||||
yield item["uuid"], self.model(**json.loads(item["data"]))
|
|
||||||
|
|
||||||
def __len__(self) -> int:
|
|
||||||
self.cursor.execute(f"SELECT COUNT(*) FROM {self.name}")
|
|
||||||
return self.cursor.fetchone()[0]
|
|
||||||
|
|
||||||
def add_item(self, item: BaseModel, id: str | None = None) -> None:
|
|
||||||
if id is None:
|
|
||||||
id = str(uuid.uuid1())
|
|
||||||
print("UUID", id, item)
|
|
||||||
self.cursor.execute(
|
|
||||||
f"INSERT INTO {self.name} (uuid, data) VALUES (?, ?)",
|
|
||||||
(id, item.model_dump_json()),
|
|
||||||
)
|
|
||||||
self.recipe.db.commit()
|
|
||||||
|
|
||||||
def reset(self) -> None:
|
|
||||||
self.cursor.execute(f"DELETE FROM {self.name}")
|
|
||||||
self.recipe.db.commit()
|
|
@ -1,77 +0,0 @@
|
|||||||
import importlib
|
|
||||||
from types import SimpleNamespace
|
|
||||||
import typer
|
|
||||||
import sys
|
|
||||||
from pprint import pprint
|
|
||||||
from typing import List, Optional
|
|
||||||
from typing_extensions import Annotated
|
|
||||||
|
|
||||||
from beakers.beakers import SqliteBeaker
|
|
||||||
|
|
||||||
app = typer.Typer()
|
|
||||||
|
|
||||||
|
|
||||||
def _load_recipe(dotted_path: str) -> SimpleNamespace:
|
|
||||||
sys.path.append(".")
|
|
||||||
path, name = dotted_path.rsplit(".", 1)
|
|
||||||
mod = importlib.import_module(path)
|
|
||||||
return getattr(mod, name)
|
|
||||||
|
|
||||||
|
|
||||||
@app.callback()
|
|
||||||
def main(
|
|
||||||
ctx: typer.Context,
|
|
||||||
recipe: str = typer.Option(None, envvar="BEAKER_RECIPE"),
|
|
||||||
) -> None:
|
|
||||||
if not recipe:
|
|
||||||
typer.secho(
|
|
||||||
"Missing recipe; pass --recipe or set env[BEAKER_RECIPE]",
|
|
||||||
fg=typer.colors.RED,
|
|
||||||
)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
ctx.obj = _load_recipe(recipe)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def reset(ctx: typer.Context) -> None:
|
|
||||||
for beaker in ctx.obj.beakers.values():
|
|
||||||
if isinstance(beaker, SqliteBeaker):
|
|
||||||
if bl := len(beaker):
|
|
||||||
beaker.reset()
|
|
||||||
typer.secho(f"{beaker.name} reset ({bl})", fg=typer.colors.RED)
|
|
||||||
else:
|
|
||||||
typer.secho(f"{beaker.name} empty", fg=typer.colors.GREEN)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def show(ctx: typer.Context) -> None:
|
|
||||||
ctx.obj.show()
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def graph(ctx: typer.Context) -> None:
|
|
||||||
pprint(ctx.obj.graph_data())
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
|
||||||
def run(
|
|
||||||
ctx: typer.Context,
|
|
||||||
input: Annotated[Optional[List[str]], typer.Option(...)] = None,
|
|
||||||
start: Optional[str] = typer.Option(None),
|
|
||||||
end: Optional[str] = typer.Option(None),
|
|
||||||
) -> None:
|
|
||||||
if ctx.obj.seeds:
|
|
||||||
typer.secho("Seeding beakers", fg=typer.colors.GREEN)
|
|
||||||
ctx.obj.process_seeds()
|
|
||||||
has_data = any(ctx.obj.beakers.values())
|
|
||||||
if not input and not has_data:
|
|
||||||
typer.secho("No data; pass --input to seed beaker(s)", fg=typer.colors.RED)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
for input_str in input or []:
|
|
||||||
beaker, filename = input_str.split("=")
|
|
||||||
ctx.obj.csv_to_beaker(filename, beaker)
|
|
||||||
ctx.obj.run_once(start, end)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
app()
|
|
@ -1,41 +0,0 @@
|
|||||||
import httpx
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
import datetime
|
|
||||||
|
|
||||||
|
|
||||||
class HttpResponse(BaseModel):
|
|
||||||
"""
|
|
||||||
Beaker data type that represents an HTTP response.
|
|
||||||
"""
|
|
||||||
|
|
||||||
url: str
|
|
||||||
status_code: int
|
|
||||||
response_body: str
|
|
||||||
retrieved_at: datetime.datetime = Field(default_factory=datetime.datetime.now)
|
|
||||||
|
|
||||||
|
|
||||||
class HttpRequest:
|
|
||||||
"""
|
|
||||||
Filter that converts from a beaker with a URL to a beaker with an HTTP response.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, beaker: str, field: str):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
beaker: The name of the beaker that contains the URL.
|
|
||||||
field: The name of the field in the beaker that contains the URL.
|
|
||||||
"""
|
|
||||||
self.beaker = beaker
|
|
||||||
self.field = field
|
|
||||||
|
|
||||||
async def __call__(self, item: BaseModel) -> HttpResponse:
|
|
||||||
url = getattr(item, self.field)
|
|
||||||
|
|
||||||
async with httpx.AsyncClient() as client:
|
|
||||||
response = await client.get(url)
|
|
||||||
|
|
||||||
return HttpResponse(
|
|
||||||
url=url,
|
|
||||||
status_code=response.status_code,
|
|
||||||
response_body=response.text,
|
|
||||||
)
|
|
@ -1,319 +0,0 @@
|
|||||||
import csv
|
|
||||||
import json
|
|
||||||
import typer
|
|
||||||
import inspect
|
|
||||||
import sqlite3
|
|
||||||
import hashlib
|
|
||||||
import asyncio
|
|
||||||
import networkx # type: ignore
|
|
||||||
from collections import defaultdict, Counter
|
|
||||||
from typing import Iterable, Callable, Type
|
|
||||||
from pydantic import BaseModel, ConfigDict
|
|
||||||
from structlog import get_logger
|
|
||||||
|
|
||||||
from .beakers import Beaker, SqliteBeaker, TempBeaker
|
|
||||||
|
|
||||||
log = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
def get_sha512(filename: str) -> str:
|
|
||||||
with open(filename, "rb") as file:
|
|
||||||
return hashlib.sha512(file.read()).hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
class Transform(BaseModel):
|
|
||||||
model_config = ConfigDict(frozen=True)
|
|
||||||
|
|
||||||
name: str
|
|
||||||
transform_func: Callable
|
|
||||||
error_map: dict[tuple, str]
|
|
||||||
|
|
||||||
|
|
||||||
class ErrorType(BaseModel):
|
|
||||||
item: BaseModel
|
|
||||||
exception: str
|
|
||||||
exc_type: str
|
|
||||||
|
|
||||||
|
|
||||||
def if_cond_true(data_cond_tup: tuple[dict, bool]) -> dict | None:
|
|
||||||
return data_cond_tup[0] if data_cond_tup[1] else None
|
|
||||||
|
|
||||||
|
|
||||||
def if_cond_false(data_cond_tup: tuple[dict, bool]) -> dict | None:
|
|
||||||
return data_cond_tup[0] if not data_cond_tup[1] else None
|
|
||||||
|
|
||||||
|
|
||||||
class Recipe:
|
|
||||||
def __init__(self, name: str, db_name: str = "beakers.db"):
|
|
||||||
self.name = name
|
|
||||||
self.graph = networkx.DiGraph()
|
|
||||||
self.beakers: dict[str, Beaker] = {}
|
|
||||||
self.seeds: defaultdict[str, list[Iterable[BaseModel]]] = defaultdict(list)
|
|
||||||
self.db = sqlite3.connect(db_name)
|
|
||||||
cursor = self.db.cursor()
|
|
||||||
cursor.execute(
|
|
||||||
"CREATE TABLE IF NOT EXISTS _metadata (table_name TEXT PRIMARY KEY, data JSON)"
|
|
||||||
)
|
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
|
||||||
return f"Recipe({self.name})"
|
|
||||||
|
|
||||||
def add_beaker(
|
|
||||||
self,
|
|
||||||
name: str,
|
|
||||||
datatype: Type[BaseModel],
|
|
||||||
beaker_type: Type[Beaker] = SqliteBeaker,
|
|
||||||
) -> Beaker:
|
|
||||||
self.graph.add_node(name, datatype=datatype)
|
|
||||||
if datatype is None:
|
|
||||||
self.beakers[name] = TempBeaker(name, datatype, self)
|
|
||||||
else:
|
|
||||||
self.beakers[name] = SqliteBeaker(name, datatype, self)
|
|
||||||
return self.beakers[name]
|
|
||||||
|
|
||||||
def add_transform(
|
|
||||||
self,
|
|
||||||
from_beaker: str,
|
|
||||||
to_beaker: str,
|
|
||||||
transform_func: Callable,
|
|
||||||
*,
|
|
||||||
name: str | None = None,
|
|
||||||
error_map: dict[tuple, str] | None = None,
|
|
||||||
) -> None:
|
|
||||||
if name is None:
|
|
||||||
name = transform_func.__name__
|
|
||||||
if name == "<lambda>":
|
|
||||||
name = "λ"
|
|
||||||
transform = Transform(
|
|
||||||
name=name,
|
|
||||||
transform_func=transform_func,
|
|
||||||
error_map=error_map or {},
|
|
||||||
)
|
|
||||||
self.graph.add_edge(
|
|
||||||
from_beaker,
|
|
||||||
to_beaker,
|
|
||||||
transform=transform,
|
|
||||||
)
|
|
||||||
|
|
||||||
def add_conditional(
|
|
||||||
self,
|
|
||||||
from_beaker: str,
|
|
||||||
condition_func: Callable,
|
|
||||||
if_true: str,
|
|
||||||
if_false: str = "",
|
|
||||||
) -> None:
|
|
||||||
# first add a transform to evaluate the conditional
|
|
||||||
if condition_func.__name__ == "<lambda>":
|
|
||||||
cond_name = f"cond-{from_beaker}"
|
|
||||||
else:
|
|
||||||
cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
|
|
||||||
self.add_beaker(cond_name, None)
|
|
||||||
self.add_transform(
|
|
||||||
from_beaker,
|
|
||||||
cond_name,
|
|
||||||
lambda data: (data, condition_func(data)),
|
|
||||||
name=cond_name,
|
|
||||||
)
|
|
||||||
|
|
||||||
# then add two filtered paths that remove the condition result
|
|
||||||
self.add_beaker(if_true, None)
|
|
||||||
self.add_transform(
|
|
||||||
cond_name,
|
|
||||||
if_true,
|
|
||||||
if_cond_true,
|
|
||||||
)
|
|
||||||
if if_false:
|
|
||||||
self.add_transform(
|
|
||||||
cond_name,
|
|
||||||
if_false,
|
|
||||||
if_cond_false,
|
|
||||||
)
|
|
||||||
|
|
||||||
def add_seed(self, beaker_name: str, data: Iterable[BaseModel]) -> None:
|
|
||||||
self.seeds[beaker_name].append(data)
|
|
||||||
|
|
||||||
def process_seeds(self) -> None:
|
|
||||||
log.info("process_seeds", recipe=self.name)
|
|
||||||
for beaker_name, seeds in self.seeds.items():
|
|
||||||
for seed in seeds:
|
|
||||||
self.beakers[beaker_name].add_items(seed)
|
|
||||||
|
|
||||||
def get_metadata(self, table_name: str) -> dict:
|
|
||||||
cursor = self.db.cursor()
|
|
||||||
cursor.execute(
|
|
||||||
"SELECT data FROM _metadata WHERE table_name = ?",
|
|
||||||
(table_name,),
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
data = cursor.fetchone()["data"]
|
|
||||||
log.debug("get_metadata", table_name=table_name, data=data)
|
|
||||||
return json.loads(data)
|
|
||||||
except TypeError:
|
|
||||||
log.debug("get_metadata", table_name=table_name, data={})
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def save_metadata(self, table_name: str, data: dict) -> None:
|
|
||||||
data_json = json.dumps(data)
|
|
||||||
log.info("save_metadata", table_name=table_name, data=data_json)
|
|
||||||
# sqlite upsert
|
|
||||||
cursor = self.db.cursor()
|
|
||||||
cursor.execute(
|
|
||||||
"INSERT INTO _metadata (table_name, data) VALUES (?, ?) ON CONFLICT(table_name) DO UPDATE SET data = ?",
|
|
||||||
(table_name, data_json, data_json),
|
|
||||||
)
|
|
||||||
self.db.commit()
|
|
||||||
|
|
||||||
def csv_to_beaker(self, filename: str, beaker_name: str) -> None:
|
|
||||||
beaker = self.beakers[beaker_name]
|
|
||||||
lg = log.bind(beaker=beaker, filename=filename)
|
|
||||||
# three cases: empty, match, mismatch
|
|
||||||
# case 1: empty
|
|
||||||
if len(beaker) == 0:
|
|
||||||
with open(filename, "r") as file:
|
|
||||||
reader = csv.DictReader(file)
|
|
||||||
added = 0
|
|
||||||
for row in reader:
|
|
||||||
beaker.add_item(beaker.model(**row))
|
|
||||||
added += 1
|
|
||||||
lg.info("from_csv", case="empty", added=added)
|
|
||||||
meta = self.get_metadata(beaker.name)
|
|
||||||
meta["sha512"] = get_sha512(filename)
|
|
||||||
self.save_metadata(beaker.name, meta)
|
|
||||||
else:
|
|
||||||
old_sha = self.get_metadata(beaker.name).get("sha512")
|
|
||||||
new_sha = get_sha512(filename)
|
|
||||||
if old_sha != new_sha:
|
|
||||||
# case 3: mismatch
|
|
||||||
lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha)
|
|
||||||
raise Exception("sha512 mismatch")
|
|
||||||
else:
|
|
||||||
# case 2: match
|
|
||||||
lg.info("from_csv", case="match")
|
|
||||||
|
|
||||||
def show(self) -> None:
|
|
||||||
seed_count = Counter(self.seeds.keys())
|
|
||||||
typer.secho("Seeds", fg=typer.colors.GREEN)
|
|
||||||
for beaker, count in seed_count.items():
|
|
||||||
typer.secho(f" {beaker} ({count})", fg=typer.colors.GREEN)
|
|
||||||
graph_data = self.graph_data()
|
|
||||||
for node in graph_data:
|
|
||||||
if node["temp"]:
|
|
||||||
typer.secho(node["name"], fg=typer.colors.CYAN)
|
|
||||||
else:
|
|
||||||
typer.secho(
|
|
||||||
f"{node['name']} ({node['len']})",
|
|
||||||
fg=typer.colors.GREEN if node["len"] else typer.colors.YELLOW,
|
|
||||||
)
|
|
||||||
for edge in node["edges"]:
|
|
||||||
print(f" -({edge['transform'].name})-> {edge['to_beaker']}")
|
|
||||||
for k, v in edge["transform"].error_map.items():
|
|
||||||
if isinstance(k, tuple):
|
|
||||||
typer.secho(
|
|
||||||
f" {' '.join(c.__name__ for c in k)} -> {v}",
|
|
||||||
fg=typer.colors.RED,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
typer.secho(f" {k.__name__} -> {v}", fg=typer.colors.RED)
|
|
||||||
|
|
||||||
def graph_data(self) -> list[dict]:
|
|
||||||
nodes = {}
|
|
||||||
|
|
||||||
for node in networkx.topological_sort(self.graph):
|
|
||||||
beaker = self.beakers[node]
|
|
||||||
temp = isinstance(beaker, TempBeaker)
|
|
||||||
|
|
||||||
nodes[node] = {
|
|
||||||
"name": node,
|
|
||||||
"temp": temp,
|
|
||||||
"len": len(beaker),
|
|
||||||
"edges": [],
|
|
||||||
}
|
|
||||||
|
|
||||||
rank = 0
|
|
||||||
for from_b, to_b, edge in self.graph.in_edges(node, data=True):
|
|
||||||
if nodes[from_b]["rank"] > rank:
|
|
||||||
rank = nodes[from_b]["rank"]
|
|
||||||
nodes[node]["rank"] = rank + 1
|
|
||||||
|
|
||||||
for from_b, to_b, edge in self.graph.out_edges(node, data=True):
|
|
||||||
edge["to_beaker"] = to_b
|
|
||||||
nodes[node]["edges"].append(edge)
|
|
||||||
|
|
||||||
# all data collected for display
|
|
||||||
return sorted(nodes.values(), key=lambda x: (x["rank"], x["name"]))
|
|
||||||
|
|
||||||
def run_once(
|
|
||||||
self, start_beaker: str | None = None, end_beaker: str | None = None
|
|
||||||
) -> None:
|
|
||||||
log.info("run_once", recipe=self)
|
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
|
|
||||||
started = False if start_beaker else True
|
|
||||||
|
|
||||||
# go through each node in forward order, pushing data
|
|
||||||
for node in networkx.topological_sort(self.graph):
|
|
||||||
# only process nodes between start and end
|
|
||||||
if not started:
|
|
||||||
if node == start_beaker:
|
|
||||||
started = True
|
|
||||||
log.info("partial run start", node=node)
|
|
||||||
else:
|
|
||||||
log.info("partial run skip", node=node, waiting_for=start_beaker)
|
|
||||||
continue
|
|
||||||
if end_beaker and node == end_beaker:
|
|
||||||
log.info("partial run end", node=node)
|
|
||||||
break
|
|
||||||
|
|
||||||
# get outbound edges
|
|
||||||
edges = self.graph.out_edges(node, data=True)
|
|
||||||
|
|
||||||
for from_b, to_b, edge in edges:
|
|
||||||
transform = edge["transform"]
|
|
||||||
|
|
||||||
from_beaker = self.beakers[from_b]
|
|
||||||
to_beaker = self.beakers[to_b]
|
|
||||||
already_processed = from_beaker.id_set() & to_beaker.id_set()
|
|
||||||
|
|
||||||
log.info(
|
|
||||||
"transform",
|
|
||||||
from_b=from_b,
|
|
||||||
to_b=to_b,
|
|
||||||
to_process=len(from_beaker) - len(already_processed),
|
|
||||||
already_processed=len(already_processed),
|
|
||||||
transform=edge["transform"].name,
|
|
||||||
)
|
|
||||||
|
|
||||||
# convert coroutine to function
|
|
||||||
if inspect.iscoroutinefunction(transform.transform_func):
|
|
||||||
t_func = lambda x: loop.run_until_complete(
|
|
||||||
transform.transform_func(x)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
t_func = transform.transform_func
|
|
||||||
|
|
||||||
for id, item in from_beaker.items():
|
|
||||||
if id in already_processed:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
transformed = t_func(item)
|
|
||||||
if transformed:
|
|
||||||
to_beaker.add_item(transformed, id)
|
|
||||||
except Exception as e:
|
|
||||||
for (
|
|
||||||
error_types,
|
|
||||||
error_beaker_name,
|
|
||||||
) in transform.error_map.items():
|
|
||||||
if isinstance(e, error_types):
|
|
||||||
error_beaker = self.beakers[error_beaker_name]
|
|
||||||
error_beaker.add_item(
|
|
||||||
ErrorType(
|
|
||||||
item=item,
|
|
||||||
exception=str(e),
|
|
||||||
exc_type=str(type(e)),
|
|
||||||
),
|
|
||||||
id,
|
|
||||||
)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
# no error handler, re-raise
|
|
||||||
raise
|
|
@ -1,23 +0,0 @@
|
|||||||
import uuid
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
|
|
||||||
class Record:
|
|
||||||
_reserved_names = ("id",)
|
|
||||||
|
|
||||||
def __init__(self, id: str | None = None):
|
|
||||||
self._id = id if id else str(uuid.uuid1())
|
|
||||||
self._data: dict[str, BaseModel] = {}
|
|
||||||
|
|
||||||
def __getattr__(self, name: str) -> str | BaseModel:
|
|
||||||
if name == "id":
|
|
||||||
return self._id
|
|
||||||
return self._data[name]
|
|
||||||
|
|
||||||
def __setattr__(self, name: str, value: BaseModel) -> None:
|
|
||||||
if name.startswith("_"):
|
|
||||||
super().__setattr__(name, value)
|
|
||||||
elif name not in self._data and name not in self._reserved_names:
|
|
||||||
self._data[name] = value
|
|
||||||
else:
|
|
||||||
raise AttributeError(f"DataObject attribute {name} already exists")
|
|
@ -1,34 +0,0 @@
|
|||||||
from beakers.record import Record
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def test_record_id_autogen():
|
|
||||||
r = Record()
|
|
||||||
assert len(r.id) == 36
|
|
||||||
r2, r3 = Record(), Record()
|
|
||||||
assert r2.id != r3.id
|
|
||||||
|
|
||||||
|
|
||||||
def test_record_id_assign():
|
|
||||||
r = Record(id="test")
|
|
||||||
assert r.id == "test"
|
|
||||||
|
|
||||||
|
|
||||||
def test_record_setattr_good():
|
|
||||||
r = Record()
|
|
||||||
r.attrib = "set"
|
|
||||||
assert r.attrib == "set"
|
|
||||||
|
|
||||||
|
|
||||||
def test_record_setattr_duplicate():
|
|
||||||
r = Record()
|
|
||||||
r.attrib = "set"
|
|
||||||
with pytest.raises(AttributeError):
|
|
||||||
r.attrib = "changed"
|
|
||||||
assert r.attrib == "set"
|
|
||||||
|
|
||||||
|
|
||||||
def test_record_setattr_id():
|
|
||||||
r = Record()
|
|
||||||
with pytest.raises(AttributeError):
|
|
||||||
r.id = "changed"
|
|
Loading…
Reference in New Issue
Block a user