databeakers
This commit is contained in:
		
							parent
							
								
									c21954ebba
								
							
						
					
					
						commit
						b357a6d1d5
					
				
					 10 changed files with 127 additions and 789 deletions
				
			
		|  | @ -1,61 +0,0 @@ | |||
| from beakers.recipe import Recipe | ||||
| import pydantic | ||||
| 
 | ||||
| 
 | ||||
| class Word(pydantic.BaseModel): | ||||
|     word: str | ||||
| 
 | ||||
| 
 | ||||
| class ClassifiedWord(pydantic.BaseModel): | ||||
|     normalized_word: str | ||||
|     is_fruit: bool | ||||
| 
 | ||||
| 
 | ||||
| class Sentence(pydantic.BaseModel): | ||||
|     sentence: list[str] | ||||
| 
 | ||||
| 
 | ||||
| def word_classifier(item) -> ClassifiedWord: | ||||
|     return ClassifiedWord( | ||||
|         normalized_word=item.word.lower(), | ||||
|         is_fruit=item.word.lower() | ||||
|         in ( | ||||
|             "apple", | ||||
|             "banana", | ||||
|             "fig", | ||||
|             "grape", | ||||
|             "lemon", | ||||
|             "mango", | ||||
|             "orange", | ||||
|             "pear", | ||||
|             "raspberry", | ||||
|         ), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| recipe = Recipe("fruits-example") | ||||
| recipe.add_beaker("word", Word) | ||||
| recipe.add_beaker("classified_word", ClassifiedWord) | ||||
| recipe.add_beaker("sentence", Sentence) | ||||
| recipe.add_transform("word", "classified_word", word_classifier) | ||||
| recipe.add_conditional( | ||||
|     "classified_word", | ||||
|     lambda cw: cw.is_fruit, | ||||
|     "fruits", | ||||
| ) | ||||
| recipe.add_transform( | ||||
|     "fruits", | ||||
|     "sentence", | ||||
|     lambda x: Sentence(sentence=f"I love a fresh {x.normalized_word}".split()), | ||||
| ) | ||||
| 
 | ||||
| recipe.add_seed( | ||||
|     "word", | ||||
|     [ | ||||
|         Word(word="apple"), | ||||
|         Word(word="bAnAnA"), | ||||
|         Word(word="hammer"), | ||||
|         Word(word="orange"), | ||||
|         Word(word="EGG"), | ||||
|     ], | ||||
| ) | ||||
							
								
								
									
										245
									
								
								poetry.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										245
									
								
								poetry.lock
									
									
									
										generated
									
									
									
								
							|  | @ -1,10 +1,9 @@ | |||
| # This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. | ||||
| # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. | ||||
| 
 | ||||
| [[package]] | ||||
| name = "annotated-types" | ||||
| version = "0.5.0" | ||||
| description = "Reusable constraint types to use with typing.Annotated" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -16,7 +15,6 @@ files = [ | |||
| name = "anyio" | ||||
| version = "3.7.1" | ||||
| description = "High level compatibility layer for multiple asynchronous event loop implementations" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -37,7 +35,6 @@ trio = ["trio (<0.22)"] | |||
| name = "certifi" | ||||
| version = "2023.5.7" | ||||
| description = "Python package for providing Mozilla's CA Bundle." | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.6" | ||||
| files = [ | ||||
|  | @ -49,7 +46,6 @@ files = [ | |||
| name = "charset-normalizer" | ||||
| version = "3.2.0" | ||||
| description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7.0" | ||||
| files = [ | ||||
|  | @ -130,11 +126,24 @@ files = [ | |||
|     {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "click" | ||||
| version = "8.1.6" | ||||
| description = "Composable command line interface toolkit" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "click-8.1.6-py3-none-any.whl", hash = "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5"}, | ||||
|     {file = "click-8.1.6.tar.gz", hash = "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd"}, | ||||
| ] | ||||
| 
 | ||||
| [package.dependencies] | ||||
| colorama = {version = "*", markers = "platform_system == \"Windows\""} | ||||
| 
 | ||||
| [[package]] | ||||
| name = "colorama" | ||||
| version = "0.4.6" | ||||
| description = "Cross-platform colored terminal text." | ||||
| category = "dev" | ||||
| optional = false | ||||
| python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" | ||||
| files = [ | ||||
|  | @ -142,11 +151,32 @@ files = [ | |||
|     {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "databeakers" | ||||
| version = "0.1.0" | ||||
| description = "" | ||||
| optional = false | ||||
| python-versions = "^3.10" | ||||
| files = [] | ||||
| develop = true | ||||
| 
 | ||||
| [package.dependencies] | ||||
| httpx = "^0.24.0" | ||||
| networkx = "^3.1" | ||||
| pydantic = "^2.0.2" | ||||
| rich = "^13.4.2" | ||||
| scrapelib = "^2.1.0" | ||||
| structlog = "^23.1.0" | ||||
| typer = "^0.9.0" | ||||
| 
 | ||||
| [package.source] | ||||
| type = "directory" | ||||
| url = "../beakers" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "h11" | ||||
| version = "0.14.0" | ||||
| description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -158,7 +188,6 @@ files = [ | |||
| name = "httpcore" | ||||
| version = "0.17.3" | ||||
| description = "A minimal low-level HTTP client." | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -170,17 +199,16 @@ files = [ | |||
| anyio = ">=3.0,<5.0" | ||||
| certifi = "*" | ||||
| h11 = ">=0.13,<0.15" | ||||
| sniffio = ">=1.0.0,<2.0.0" | ||||
| sniffio = "==1.*" | ||||
| 
 | ||||
| [package.extras] | ||||
| http2 = ["h2 (>=3,<5)"] | ||||
| socks = ["socksio (>=1.0.0,<2.0.0)"] | ||||
| socks = ["socksio (==1.*)"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "httpx" | ||||
| version = "0.24.1" | ||||
| description = "The next generation HTTP client." | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -196,15 +224,14 @@ sniffio = "*" | |||
| 
 | ||||
| [package.extras] | ||||
| brotli = ["brotli", "brotlicffi"] | ||||
| cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] | ||||
| cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] | ||||
| http2 = ["h2 (>=3,<5)"] | ||||
| socks = ["socksio (>=1.0.0,<2.0.0)"] | ||||
| socks = ["socksio (==1.*)"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "idna" | ||||
| version = "3.4" | ||||
| description = "Internationalized Domain Names in Applications (IDNA)" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.5" | ||||
| files = [ | ||||
|  | @ -213,80 +240,44 @@ files = [ | |||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "iniconfig" | ||||
| version = "2.0.0" | ||||
| description = "brain-dead simple config-ini parsing" | ||||
| category = "dev" | ||||
| name = "markdown-it-py" | ||||
| version = "3.0.0" | ||||
| description = "Python port of markdown-it. Markdown parsing, done right!" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| python-versions = ">=3.8" | ||||
| files = [ | ||||
|     {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, | ||||
|     {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "mypy" | ||||
| version = "1.4.1" | ||||
| description = "Optional static typing for Python" | ||||
| category = "dev" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "mypy-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:566e72b0cd6598503e48ea610e0052d1b8168e60a46e0bfd34b3acf2d57f96a8"}, | ||||
|     {file = "mypy-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ca637024ca67ab24a7fd6f65d280572c3794665eaf5edcc7e90a866544076878"}, | ||||
|     {file = "mypy-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dde1d180cd84f0624c5dcaaa89c89775550a675aff96b5848de78fb11adabcd"}, | ||||
|     {file = "mypy-1.4.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8c4d8e89aa7de683e2056a581ce63c46a0c41e31bd2b6d34144e2c80f5ea53dc"}, | ||||
|     {file = "mypy-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:bfdca17c36ae01a21274a3c387a63aa1aafe72bff976522886869ef131b937f1"}, | ||||
|     {file = "mypy-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7549fbf655e5825d787bbc9ecf6028731973f78088fbca3a1f4145c39ef09462"}, | ||||
|     {file = "mypy-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98324ec3ecf12296e6422939e54763faedbfcc502ea4a4c38502082711867258"}, | ||||
|     {file = "mypy-1.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:141dedfdbfe8a04142881ff30ce6e6653c9685b354876b12e4fe6c78598b45e2"}, | ||||
|     {file = "mypy-1.4.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:8207b7105829eca6f3d774f64a904190bb2231de91b8b186d21ffd98005f14a7"}, | ||||
|     {file = "mypy-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:16f0db5b641ba159eff72cff08edc3875f2b62b2fa2bc24f68c1e7a4e8232d01"}, | ||||
|     {file = "mypy-1.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:470c969bb3f9a9efcedbadcd19a74ffb34a25f8e6b0e02dae7c0e71f8372f97b"}, | ||||
|     {file = "mypy-1.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5952d2d18b79f7dc25e62e014fe5a23eb1a3d2bc66318df8988a01b1a037c5b"}, | ||||
|     {file = "mypy-1.4.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:190b6bab0302cec4e9e6767d3eb66085aef2a1cc98fe04936d8a42ed2ba77bb7"}, | ||||
|     {file = "mypy-1.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:9d40652cc4fe33871ad3338581dca3297ff5f2213d0df345bcfbde5162abf0c9"}, | ||||
|     {file = "mypy-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01fd2e9f85622d981fd9063bfaef1aed6e336eaacca00892cd2d82801ab7c042"}, | ||||
|     {file = "mypy-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2460a58faeea905aeb1b9b36f5065f2dc9a9c6e4c992a6499a2360c6c74ceca3"}, | ||||
|     {file = "mypy-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a2746d69a8196698146a3dbe29104f9eb6a2a4d8a27878d92169a6c0b74435b6"}, | ||||
|     {file = "mypy-1.4.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ae704dcfaa180ff7c4cfbad23e74321a2b774f92ca77fd94ce1049175a21c97f"}, | ||||
|     {file = "mypy-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:43d24f6437925ce50139a310a64b2ab048cb2d3694c84c71c3f2a1626d8101dc"}, | ||||
|     {file = "mypy-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c482e1246726616088532b5e964e39765b6d1520791348e6c9dc3af25b233828"}, | ||||
|     {file = "mypy-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:43b592511672017f5b1a483527fd2684347fdffc041c9ef53428c8dc530f79a3"}, | ||||
|     {file = "mypy-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34a9239d5b3502c17f07fd7c0b2ae6b7dd7d7f6af35fbb5072c6208e76295816"}, | ||||
|     {file = "mypy-1.4.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5703097c4936bbb9e9bce41478c8d08edd2865e177dc4c52be759f81ee4dd26c"}, | ||||
|     {file = "mypy-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:e02d700ec8d9b1859790c0475df4e4092c7bf3272a4fd2c9f33d87fac4427b8f"}, | ||||
|     {file = "mypy-1.4.1-py3-none-any.whl", hash = "sha256:45d32cec14e7b97af848bddd97d85ea4f0db4d5a149ed9676caa4eb2f7402bb4"}, | ||||
|     {file = "mypy-1.4.1.tar.gz", hash = "sha256:9bbcd9ab8ea1f2e1c8031c21445b511442cc45c89951e49bbf852cbb70755b1b"}, | ||||
|     {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, | ||||
|     {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, | ||||
| ] | ||||
| 
 | ||||
| [package.dependencies] | ||||
| mypy-extensions = ">=1.0.0" | ||||
| typing-extensions = ">=4.1.0" | ||||
| mdurl = ">=0.1,<1.0" | ||||
| 
 | ||||
| [package.extras] | ||||
| dmypy = ["psutil (>=4.0)"] | ||||
| install-types = ["pip"] | ||||
| python2 = ["typed-ast (>=1.4.0,<2)"] | ||||
| reports = ["lxml"] | ||||
| benchmarking = ["psutil", "pytest", "pytest-benchmark"] | ||||
| code-style = ["pre-commit (>=3.0,<4.0)"] | ||||
| compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] | ||||
| linkify = ["linkify-it-py (>=1,<3)"] | ||||
| plugins = ["mdit-py-plugins"] | ||||
| profiling = ["gprof2dot"] | ||||
| rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] | ||||
| testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "mypy-extensions" | ||||
| version = "1.0.0" | ||||
| description = "Type system extensions for programs checked with the mypy type checker." | ||||
| category = "dev" | ||||
| name = "mdurl" | ||||
| version = "0.1.2" | ||||
| description = "Markdown URL utilities" | ||||
| optional = false | ||||
| python-versions = ">=3.5" | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, | ||||
|     {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, | ||||
|     {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, | ||||
|     {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "networkx" | ||||
| version = "3.1" | ||||
| description = "Python package for creating and manipulating graphs and networks" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.8" | ||||
| files = [ | ||||
|  | @ -301,39 +292,10 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx- | |||
| extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] | ||||
| test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "packaging" | ||||
| version = "23.1" | ||||
| description = "Core utilities for Python packages" | ||||
| category = "dev" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, | ||||
|     {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "pluggy" | ||||
| version = "1.2.0" | ||||
| description = "plugin and hook calling mechanisms for python" | ||||
| category = "dev" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, | ||||
|     {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, | ||||
| ] | ||||
| 
 | ||||
| [package.extras] | ||||
| dev = ["pre-commit", "tox"] | ||||
| testing = ["pytest", "pytest-benchmark"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "pydantic" | ||||
| version = "2.0.2" | ||||
| description = "Data validation using Python type hints" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -353,7 +315,6 @@ email = ["email-validator (>=2.0.0)"] | |||
| name = "pydantic-core" | ||||
| version = "2.1.2" | ||||
| description = "" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -464,31 +425,23 @@ files = [ | |||
| typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" | ||||
| 
 | ||||
| [[package]] | ||||
| name = "pytest" | ||||
| version = "7.4.0" | ||||
| description = "pytest: simple powerful testing with Python" | ||||
| category = "dev" | ||||
| name = "pygments" | ||||
| version = "2.15.1" | ||||
| description = "Pygments is a syntax highlighting package written in Python." | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, | ||||
|     {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, | ||||
|     {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, | ||||
|     {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, | ||||
| ] | ||||
| 
 | ||||
| [package.dependencies] | ||||
| colorama = {version = "*", markers = "sys_platform == \"win32\""} | ||||
| iniconfig = "*" | ||||
| packaging = "*" | ||||
| pluggy = ">=0.12,<2.0" | ||||
| 
 | ||||
| [package.extras] | ||||
| testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] | ||||
| plugins = ["importlib-metadata"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "requests" | ||||
| version = "2.31.0" | ||||
| description = "Python HTTP for Humans." | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -506,11 +459,28 @@ urllib3 = ">=1.21.1,<3" | |||
| socks = ["PySocks (>=1.5.6,!=1.5.7)"] | ||||
| use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "rich" | ||||
| version = "13.5.2" | ||||
| description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" | ||||
| optional = false | ||||
| python-versions = ">=3.7.0" | ||||
| files = [ | ||||
|     {file = "rich-13.5.2-py3-none-any.whl", hash = "sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808"}, | ||||
|     {file = "rich-13.5.2.tar.gz", hash = "sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39"}, | ||||
| ] | ||||
| 
 | ||||
| [package.dependencies] | ||||
| markdown-it-py = ">=2.2.0" | ||||
| pygments = ">=2.13.0,<3.0.0" | ||||
| 
 | ||||
| [package.extras] | ||||
| jupyter = ["ipywidgets (>=7.5.1,<9)"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "scrapelib" | ||||
| version = "2.2.0" | ||||
| description = "" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7,<4.0" | ||||
| files = [ | ||||
|  | @ -526,7 +496,6 @@ urllib3 = ">=1.26,<2.0" | |||
| name = "sniffio" | ||||
| version = "1.3.0" | ||||
| description = "Sniff out which async library your code is running under" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -534,11 +503,48 @@ files = [ | |||
|     {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, | ||||
| ] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "structlog" | ||||
| version = "23.1.0" | ||||
| description = "Structured Logging for Python" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|     {file = "structlog-23.1.0-py3-none-any.whl", hash = "sha256:79b9e68e48b54e373441e130fa447944e6f87a05b35de23138e475c05d0f7e0e"}, | ||||
|     {file = "structlog-23.1.0.tar.gz", hash = "sha256:270d681dd7d163c11ba500bc914b2472d2b50a8ef00faa999ded5ff83a2f906b"}, | ||||
| ] | ||||
| 
 | ||||
| [package.extras] | ||||
| dev = ["structlog[docs,tests,typing]"] | ||||
| docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-mermaid", "twisted"] | ||||
| tests = ["coverage[toml]", "freezegun (>=0.2.8)", "pretend", "pytest (>=6.0)", "pytest-asyncio (>=0.17)", "simplejson"] | ||||
| typing = ["mypy", "rich", "twisted"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "typer" | ||||
| version = "0.9.0" | ||||
| description = "Typer, build great CLIs. Easy to code. Based on Python type hints." | ||||
| optional = false | ||||
| python-versions = ">=3.6" | ||||
| files = [ | ||||
|     {file = "typer-0.9.0-py3-none-any.whl", hash = "sha256:5d96d986a21493606a358cae4461bd8cdf83cbf33a5aa950ae629ca3b51467ee"}, | ||||
|     {file = "typer-0.9.0.tar.gz", hash = "sha256:50922fd79aea2f4751a8e0408ff10d2662bd0c8bbfa84755a699f3bada2978b2"}, | ||||
| ] | ||||
| 
 | ||||
| [package.dependencies] | ||||
| click = ">=7.1.1,<9.0.0" | ||||
| typing-extensions = ">=3.7.4.3" | ||||
| 
 | ||||
| [package.extras] | ||||
| all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] | ||||
| dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] | ||||
| doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] | ||||
| test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] | ||||
| 
 | ||||
| [[package]] | ||||
| name = "typing-extensions" | ||||
| version = "4.7.1" | ||||
| description = "Backported and Experimental Type Hints for Python 3.7+" | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=3.7" | ||||
| files = [ | ||||
|  | @ -550,7 +556,6 @@ files = [ | |||
| name = "urllib3" | ||||
| version = "1.26.16" | ||||
| description = "HTTP library with thread-safe connection pooling, file post, and more." | ||||
| category = "main" | ||||
| optional = false | ||||
| python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" | ||||
| files = [ | ||||
|  | @ -566,4 +571,4 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] | |||
| [metadata] | ||||
| lock-version = "2.0" | ||||
| python-versions = "^3.11" | ||||
| content-hash = "136157484750ab7b80e7896a5dc2905b4f9c7ed20b47dadba4e6ae45f0f6c289" | ||||
| content-hash = "a121f92ed7a60a9138f1e19bbf88220cd0a5b25c0287f031e36cec0a79188530" | ||||
|  |  | |||
|  | @ -1,27 +1,15 @@ | |||
| [tool.poetry] | ||||
| name = "beakers" | ||||
| name = "foiaghost" | ||||
| version = "0.1.0" | ||||
| description = "" | ||||
| authors = ["James Turk <dev@jamesturk.net>"] | ||||
| readme = "README.md" | ||||
| 
 | ||||
| [tool.poetry.scripts] | ||||
| bkr = 'beakers.cli:app' | ||||
| 
 | ||||
| 
 | ||||
| [tool.poetry.dependencies] | ||||
| python = "^3.11" | ||||
| #scrapeghost = {path = "../scrapeghost", develop = true} | ||||
| scrapelib = "^2.1.0" | ||||
| httpx = "^0.24.0" | ||||
| networkx = "^3.1" | ||||
| pydantic = "^2.0.2" | ||||
| databeakers = {path = "../beakers", develop = true} | ||||
| 
 | ||||
| 
 | ||||
| [tool.poetry.group.dev.dependencies] | ||||
| pytest = "^7.4.0" | ||||
| mypy = "^1.4.1" | ||||
| 
 | ||||
| [build-system] | ||||
| requires = ["poetry-core"] | ||||
| build-backend = "poetry.core.masonry.api" | ||||
|  |  | |||
|  | @ -1 +0,0 @@ | |||
| from .recipe import Recipe | ||||
|  | @ -1,99 +0,0 @@ | |||
| import abc | ||||
| import json | ||||
| import sqlite3 | ||||
| import uuid | ||||
| from pydantic import BaseModel | ||||
| from typing import Iterable, Type, TYPE_CHECKING | ||||
| 
 | ||||
| if TYPE_CHECKING: | ||||
|     from .recipe import Recipe | ||||
| 
 | ||||
| PydanticModel = Type[BaseModel] | ||||
| 
 | ||||
| 
 | ||||
| class Beaker(abc.ABC): | ||||
|     def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"): | ||||
|         self.name = name | ||||
|         self.model = model | ||||
|         self.recipe = recipe | ||||
| 
 | ||||
|     def __repr__(self) -> str: | ||||
|         return f"Beaker({self.name}, {self.model.__name__})" | ||||
| 
 | ||||
|     @abc.abstractmethod | ||||
|     def items(self) -> Iterable[tuple[str, BaseModel]]: | ||||
|         pass | ||||
| 
 | ||||
|     @abc.abstractmethod | ||||
|     def __len__(self) -> int: | ||||
|         pass | ||||
| 
 | ||||
|     @abc.abstractmethod | ||||
|     def add_item(self, item: BaseModel, id: str | None = None) -> None: | ||||
|         pass | ||||
| 
 | ||||
|     @abc.abstractmethod | ||||
|     def reset(self) -> None: | ||||
|         pass | ||||
| 
 | ||||
|     def add_items(self, items: Iterable[BaseModel]) -> None: | ||||
|         for item in items: | ||||
|             self.add_item(item) | ||||
| 
 | ||||
|     def id_set(self) -> set[str]: | ||||
|         return set(id for id, _ in self.items()) | ||||
| 
 | ||||
| 
 | ||||
| class TempBeaker(Beaker): | ||||
|     def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"): | ||||
|         super().__init__(name, model, recipe) | ||||
|         self._items: list[tuple[str, BaseModel]] = [] | ||||
| 
 | ||||
|     def __len__(self) -> int: | ||||
|         return len(self._items) | ||||
| 
 | ||||
|     def add_item(self, item: BaseModel, id: str | None = None) -> None: | ||||
|         if id is None: | ||||
|             id = str(uuid.uuid1()) | ||||
|         self._items.append((id, item)) | ||||
| 
 | ||||
|     def items(self) -> Iterable[tuple[str, BaseModel]]: | ||||
|         yield from self._items | ||||
| 
 | ||||
|     def reset(self) -> None: | ||||
|         self._items = [] | ||||
| 
 | ||||
| 
 | ||||
| class SqliteBeaker(Beaker): | ||||
|     def __init__(self, name: str, model: PydanticModel, recipe: "Recipe"): | ||||
|         super().__init__(name, model, recipe) | ||||
|         # create table if it doesn't exist | ||||
|         self.cursor = self.recipe.db.cursor() | ||||
|         self.cursor.row_factory = sqlite3.Row  # type: ignore | ||||
|         self.cursor.execute( | ||||
|             f"CREATE TABLE IF NOT EXISTS {self.name} (uuid TEXT PRIMARY KEY, data JSON)" | ||||
|         ) | ||||
| 
 | ||||
|     def items(self) -> Iterable[tuple[str, BaseModel]]: | ||||
|         self.cursor.execute(f"SELECT uuid, data FROM {self.name}") | ||||
|         data = self.cursor.fetchall() | ||||
|         for item in data: | ||||
|             yield item["uuid"], self.model(**json.loads(item["data"])) | ||||
| 
 | ||||
|     def __len__(self) -> int: | ||||
|         self.cursor.execute(f"SELECT COUNT(*) FROM {self.name}") | ||||
|         return self.cursor.fetchone()[0] | ||||
| 
 | ||||
|     def add_item(self, item: BaseModel, id: str | None = None) -> None: | ||||
|         if id is None: | ||||
|             id = str(uuid.uuid1()) | ||||
|             print("UUID", id, item) | ||||
|         self.cursor.execute( | ||||
|             f"INSERT INTO {self.name} (uuid, data) VALUES (?, ?)", | ||||
|             (id, item.model_dump_json()), | ||||
|         ) | ||||
|         self.recipe.db.commit() | ||||
| 
 | ||||
|     def reset(self) -> None: | ||||
|         self.cursor.execute(f"DELETE FROM {self.name}") | ||||
|         self.recipe.db.commit() | ||||
|  | @ -1,77 +0,0 @@ | |||
| import importlib | ||||
| from types import SimpleNamespace | ||||
| import typer | ||||
| import sys | ||||
| from pprint import pprint | ||||
| from typing import List, Optional | ||||
| from typing_extensions import Annotated | ||||
| 
 | ||||
| from beakers.beakers import SqliteBeaker | ||||
| 
 | ||||
| app = typer.Typer() | ||||
| 
 | ||||
| 
 | ||||
| def _load_recipe(dotted_path: str) -> SimpleNamespace: | ||||
|     sys.path.append(".") | ||||
|     path, name = dotted_path.rsplit(".", 1) | ||||
|     mod = importlib.import_module(path) | ||||
|     return getattr(mod, name) | ||||
| 
 | ||||
| 
 | ||||
| @app.callback() | ||||
| def main( | ||||
|     ctx: typer.Context, | ||||
|     recipe: str = typer.Option(None, envvar="BEAKER_RECIPE"), | ||||
| ) -> None: | ||||
|     if not recipe: | ||||
|         typer.secho( | ||||
|             "Missing recipe; pass --recipe or set env[BEAKER_RECIPE]", | ||||
|             fg=typer.colors.RED, | ||||
|         ) | ||||
|         raise typer.Exit(1) | ||||
|     ctx.obj = _load_recipe(recipe) | ||||
| 
 | ||||
| 
 | ||||
| @app.command() | ||||
| def reset(ctx: typer.Context) -> None: | ||||
|     for beaker in ctx.obj.beakers.values(): | ||||
|         if isinstance(beaker, SqliteBeaker): | ||||
|             if bl := len(beaker): | ||||
|                 beaker.reset() | ||||
|                 typer.secho(f"{beaker.name} reset ({bl})", fg=typer.colors.RED) | ||||
|             else: | ||||
|                 typer.secho(f"{beaker.name} empty", fg=typer.colors.GREEN) | ||||
| 
 | ||||
| 
 | ||||
| @app.command() | ||||
| def show(ctx: typer.Context) -> None: | ||||
|     ctx.obj.show() | ||||
| 
 | ||||
| 
 | ||||
| @app.command() | ||||
| def graph(ctx: typer.Context) -> None: | ||||
|     pprint(ctx.obj.graph_data()) | ||||
| 
 | ||||
| 
 | ||||
| @app.command() | ||||
| def run( | ||||
|     ctx: typer.Context, | ||||
|     input: Annotated[Optional[List[str]], typer.Option(...)] = None, | ||||
|     start: Optional[str] = typer.Option(None), | ||||
|     end: Optional[str] = typer.Option(None), | ||||
| ) -> None: | ||||
|     if ctx.obj.seeds: | ||||
|         typer.secho("Seeding beakers", fg=typer.colors.GREEN) | ||||
|         ctx.obj.process_seeds() | ||||
|     has_data = any(ctx.obj.beakers.values()) | ||||
|     if not input and not has_data: | ||||
|         typer.secho("No data; pass --input to seed beaker(s)", fg=typer.colors.RED) | ||||
|         raise typer.Exit(1) | ||||
|     for input_str in input or []: | ||||
|         beaker, filename = input_str.split("=") | ||||
|         ctx.obj.csv_to_beaker(filename, beaker) | ||||
|     ctx.obj.run_once(start, end) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     app() | ||||
|  | @ -1,41 +0,0 @@ | |||
| import httpx | ||||
| from pydantic import BaseModel, Field | ||||
| import datetime | ||||
| 
 | ||||
| 
 | ||||
| class HttpResponse(BaseModel): | ||||
|     """ | ||||
|     Beaker data type that represents an HTTP response. | ||||
|     """ | ||||
| 
 | ||||
|     url: str | ||||
|     status_code: int | ||||
|     response_body: str | ||||
|     retrieved_at: datetime.datetime = Field(default_factory=datetime.datetime.now) | ||||
| 
 | ||||
| 
 | ||||
| class HttpRequest: | ||||
|     """ | ||||
|     Filter that converts from a beaker with a URL to a beaker with an HTTP response. | ||||
|     """ | ||||
| 
 | ||||
|     def __init__(self, beaker: str, field: str): | ||||
|         """ | ||||
|         Args: | ||||
|             beaker: The name of the beaker that contains the URL. | ||||
|             field: The name of the field in the beaker that contains the URL. | ||||
|         """ | ||||
|         self.beaker = beaker | ||||
|         self.field = field | ||||
| 
 | ||||
|     async def __call__(self, item: BaseModel) -> HttpResponse: | ||||
|         url = getattr(item, self.field) | ||||
| 
 | ||||
|         async with httpx.AsyncClient() as client: | ||||
|             response = await client.get(url) | ||||
| 
 | ||||
|         return HttpResponse( | ||||
|             url=url, | ||||
|             status_code=response.status_code, | ||||
|             response_body=response.text, | ||||
|         ) | ||||
|  | @ -1,319 +0,0 @@ | |||
| import csv | ||||
| import json | ||||
| import typer | ||||
| import inspect | ||||
| import sqlite3 | ||||
| import hashlib | ||||
| import asyncio | ||||
| import networkx  # type: ignore | ||||
| from collections import defaultdict, Counter | ||||
| from typing import Iterable, Callable, Type | ||||
| from pydantic import BaseModel, ConfigDict | ||||
| from structlog import get_logger | ||||
| 
 | ||||
| from .beakers import Beaker, SqliteBeaker, TempBeaker | ||||
| 
 | ||||
| log = get_logger() | ||||
| 
 | ||||
| 
 | ||||
| def get_sha512(filename: str) -> str: | ||||
|     with open(filename, "rb") as file: | ||||
|         return hashlib.sha512(file.read()).hexdigest() | ||||
| 
 | ||||
| 
 | ||||
| class Transform(BaseModel): | ||||
|     model_config = ConfigDict(frozen=True) | ||||
| 
 | ||||
|     name: str | ||||
|     transform_func: Callable | ||||
|     error_map: dict[tuple, str] | ||||
| 
 | ||||
| 
 | ||||
| class ErrorType(BaseModel): | ||||
|     item: BaseModel | ||||
|     exception: str | ||||
|     exc_type: str | ||||
| 
 | ||||
| 
 | ||||
| def if_cond_true(data_cond_tup: tuple[dict, bool]) -> dict | None: | ||||
|     return data_cond_tup[0] if data_cond_tup[1] else None | ||||
| 
 | ||||
| 
 | ||||
| def if_cond_false(data_cond_tup: tuple[dict, bool]) -> dict | None: | ||||
|     return data_cond_tup[0] if not data_cond_tup[1] else None | ||||
| 
 | ||||
| 
 | ||||
| class Recipe: | ||||
|     def __init__(self, name: str, db_name: str = "beakers.db"): | ||||
|         self.name = name | ||||
|         self.graph = networkx.DiGraph() | ||||
|         self.beakers: dict[str, Beaker] = {} | ||||
|         self.seeds: defaultdict[str, list[Iterable[BaseModel]]] = defaultdict(list) | ||||
|         self.db = sqlite3.connect(db_name) | ||||
|         cursor = self.db.cursor() | ||||
|         cursor.execute( | ||||
|             "CREATE TABLE IF NOT EXISTS _metadata (table_name TEXT PRIMARY KEY, data JSON)" | ||||
|         ) | ||||
| 
 | ||||
|     def __repr__(self) -> str: | ||||
|         return f"Recipe({self.name})" | ||||
| 
 | ||||
|     def add_beaker( | ||||
|         self, | ||||
|         name: str, | ||||
|         datatype: Type[BaseModel], | ||||
|         beaker_type: Type[Beaker] = SqliteBeaker, | ||||
|     ) -> Beaker: | ||||
|         self.graph.add_node(name, datatype=datatype) | ||||
|         if datatype is None: | ||||
|             self.beakers[name] = TempBeaker(name, datatype, self) | ||||
|         else: | ||||
|             self.beakers[name] = SqliteBeaker(name, datatype, self) | ||||
|         return self.beakers[name] | ||||
| 
 | ||||
|     def add_transform( | ||||
|         self, | ||||
|         from_beaker: str, | ||||
|         to_beaker: str, | ||||
|         transform_func: Callable, | ||||
|         *, | ||||
|         name: str | None = None, | ||||
|         error_map: dict[tuple, str] | None = None, | ||||
|     ) -> None: | ||||
|         if name is None: | ||||
|             name = transform_func.__name__ | ||||
|             if name == "<lambda>": | ||||
|                 name = "λ" | ||||
|         transform = Transform( | ||||
|             name=name, | ||||
|             transform_func=transform_func, | ||||
|             error_map=error_map or {}, | ||||
|         ) | ||||
|         self.graph.add_edge( | ||||
|             from_beaker, | ||||
|             to_beaker, | ||||
|             transform=transform, | ||||
|         ) | ||||
| 
 | ||||
|     def add_conditional( | ||||
|         self, | ||||
|         from_beaker: str, | ||||
|         condition_func: Callable, | ||||
|         if_true: str, | ||||
|         if_false: str = "", | ||||
|     ) -> None: | ||||
|         # first add a transform to evaluate the conditional | ||||
|         if condition_func.__name__ == "<lambda>": | ||||
|             cond_name = f"cond-{from_beaker}" | ||||
|         else: | ||||
|             cond_name = f"cond-{from_beaker}-{condition_func.__name__}" | ||||
|         self.add_beaker(cond_name, None) | ||||
|         self.add_transform( | ||||
|             from_beaker, | ||||
|             cond_name, | ||||
|             lambda data: (data, condition_func(data)), | ||||
|             name=cond_name, | ||||
|         ) | ||||
| 
 | ||||
|         # then add two filtered paths that remove the condition result | ||||
|         self.add_beaker(if_true, None) | ||||
|         self.add_transform( | ||||
|             cond_name, | ||||
|             if_true, | ||||
|             if_cond_true, | ||||
|         ) | ||||
|         if if_false: | ||||
|             self.add_transform( | ||||
|                 cond_name, | ||||
|                 if_false, | ||||
|                 if_cond_false, | ||||
|             ) | ||||
| 
 | ||||
|     def add_seed(self, beaker_name: str, data: Iterable[BaseModel]) -> None: | ||||
|         self.seeds[beaker_name].append(data) | ||||
| 
 | ||||
|     def process_seeds(self) -> None: | ||||
|         log.info("process_seeds", recipe=self.name) | ||||
|         for beaker_name, seeds in self.seeds.items(): | ||||
|             for seed in seeds: | ||||
|                 self.beakers[beaker_name].add_items(seed) | ||||
| 
 | ||||
|     def get_metadata(self, table_name: str) -> dict: | ||||
|         cursor = self.db.cursor() | ||||
|         cursor.execute( | ||||
|             "SELECT data FROM _metadata WHERE table_name = ?", | ||||
|             (table_name,), | ||||
|         ) | ||||
|         try: | ||||
|             data = cursor.fetchone()["data"] | ||||
|             log.debug("get_metadata", table_name=table_name, data=data) | ||||
|             return json.loads(data) | ||||
|         except TypeError: | ||||
|             log.debug("get_metadata", table_name=table_name, data={}) | ||||
|             return {} | ||||
| 
 | ||||
|     def save_metadata(self, table_name: str, data: dict) -> None: | ||||
|         data_json = json.dumps(data) | ||||
|         log.info("save_metadata", table_name=table_name, data=data_json) | ||||
|         # sqlite upsert | ||||
|         cursor = self.db.cursor() | ||||
|         cursor.execute( | ||||
|             "INSERT INTO _metadata (table_name, data) VALUES (?, ?) ON CONFLICT(table_name) DO UPDATE SET data = ?", | ||||
|             (table_name, data_json, data_json), | ||||
|         ) | ||||
|         self.db.commit() | ||||
| 
 | ||||
|     def csv_to_beaker(self, filename: str, beaker_name: str) -> None: | ||||
|         beaker = self.beakers[beaker_name] | ||||
|         lg = log.bind(beaker=beaker, filename=filename) | ||||
|         # three cases: empty, match, mismatch | ||||
|         # case 1: empty | ||||
|         if len(beaker) == 0: | ||||
|             with open(filename, "r") as file: | ||||
|                 reader = csv.DictReader(file) | ||||
|                 added = 0 | ||||
|                 for row in reader: | ||||
|                     beaker.add_item(beaker.model(**row)) | ||||
|                     added += 1 | ||||
|             lg.info("from_csv", case="empty", added=added) | ||||
|             meta = self.get_metadata(beaker.name) | ||||
|             meta["sha512"] = get_sha512(filename) | ||||
|             self.save_metadata(beaker.name, meta) | ||||
|         else: | ||||
|             old_sha = self.get_metadata(beaker.name).get("sha512") | ||||
|             new_sha = get_sha512(filename) | ||||
|             if old_sha != new_sha: | ||||
|                 # case 3: mismatch | ||||
|                 lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha) | ||||
|                 raise Exception("sha512 mismatch") | ||||
|             else: | ||||
|                 # case 2: match | ||||
|                 lg.info("from_csv", case="match") | ||||
| 
 | ||||
|     def show(self) -> None: | ||||
|         seed_count = Counter(self.seeds.keys()) | ||||
|         typer.secho("Seeds", fg=typer.colors.GREEN) | ||||
|         for beaker, count in seed_count.items(): | ||||
|             typer.secho(f"  {beaker} ({count})", fg=typer.colors.GREEN) | ||||
|         graph_data = self.graph_data() | ||||
|         for node in graph_data: | ||||
|             if node["temp"]: | ||||
|                 typer.secho(node["name"], fg=typer.colors.CYAN) | ||||
|             else: | ||||
|                 typer.secho( | ||||
|                     f"{node['name']} ({node['len']})", | ||||
|                     fg=typer.colors.GREEN if node["len"] else typer.colors.YELLOW, | ||||
|                 ) | ||||
|             for edge in node["edges"]: | ||||
|                 print(f"  -({edge['transform'].name})-> {edge['to_beaker']}") | ||||
|                 for k, v in edge["transform"].error_map.items(): | ||||
|                     if isinstance(k, tuple): | ||||
|                         typer.secho( | ||||
|                             f"    {' '.join(c.__name__ for c in k)} -> {v}", | ||||
|                             fg=typer.colors.RED, | ||||
|                         ) | ||||
|                     else: | ||||
|                         typer.secho(f"    {k.__name__} -> {v}", fg=typer.colors.RED) | ||||
| 
 | ||||
|     def graph_data(self) -> list[dict]: | ||||
|         nodes = {} | ||||
| 
 | ||||
|         for node in networkx.topological_sort(self.graph): | ||||
|             beaker = self.beakers[node] | ||||
|             temp = isinstance(beaker, TempBeaker) | ||||
| 
 | ||||
|             nodes[node] = { | ||||
|                 "name": node, | ||||
|                 "temp": temp, | ||||
|                 "len": len(beaker), | ||||
|                 "edges": [], | ||||
|             } | ||||
| 
 | ||||
|             rank = 0 | ||||
|             for from_b, to_b, edge in self.graph.in_edges(node, data=True): | ||||
|                 if nodes[from_b]["rank"] > rank: | ||||
|                     rank = nodes[from_b]["rank"] | ||||
|             nodes[node]["rank"] = rank + 1 | ||||
| 
 | ||||
|             for from_b, to_b, edge in self.graph.out_edges(node, data=True): | ||||
|                 edge["to_beaker"] = to_b | ||||
|                 nodes[node]["edges"].append(edge) | ||||
| 
 | ||||
|         # all data collected for display | ||||
|         return sorted(nodes.values(), key=lambda x: (x["rank"], x["name"])) | ||||
| 
 | ||||
|     def run_once( | ||||
|         self, start_beaker: str | None = None, end_beaker: str | None = None | ||||
|     ) -> None: | ||||
|         log.info("run_once", recipe=self) | ||||
|         loop = asyncio.get_event_loop() | ||||
| 
 | ||||
|         started = False if start_beaker else True | ||||
| 
 | ||||
|         # go through each node in forward order, pushing data | ||||
|         for node in networkx.topological_sort(self.graph): | ||||
|             # only process nodes between start and end | ||||
|             if not started: | ||||
|                 if node == start_beaker: | ||||
|                     started = True | ||||
|                     log.info("partial run start", node=node) | ||||
|                 else: | ||||
|                     log.info("partial run skip", node=node, waiting_for=start_beaker) | ||||
|                     continue | ||||
|             if end_beaker and node == end_beaker: | ||||
|                 log.info("partial run end", node=node) | ||||
|                 break | ||||
| 
 | ||||
|             # get outbound edges | ||||
|             edges = self.graph.out_edges(node, data=True) | ||||
| 
 | ||||
|             for from_b, to_b, edge in edges: | ||||
|                 transform = edge["transform"] | ||||
| 
 | ||||
|                 from_beaker = self.beakers[from_b] | ||||
|                 to_beaker = self.beakers[to_b] | ||||
|                 already_processed = from_beaker.id_set() & to_beaker.id_set() | ||||
| 
 | ||||
|                 log.info( | ||||
|                     "transform", | ||||
|                     from_b=from_b, | ||||
|                     to_b=to_b, | ||||
|                     to_process=len(from_beaker) - len(already_processed), | ||||
|                     already_processed=len(already_processed), | ||||
|                     transform=edge["transform"].name, | ||||
|                 ) | ||||
| 
 | ||||
|                 # convert coroutine to function | ||||
|                 if inspect.iscoroutinefunction(transform.transform_func): | ||||
|                     t_func = lambda x: loop.run_until_complete( | ||||
|                         transform.transform_func(x) | ||||
|                     ) | ||||
|                 else: | ||||
|                     t_func = transform.transform_func | ||||
| 
 | ||||
|                 for id, item in from_beaker.items(): | ||||
|                     if id in already_processed: | ||||
|                         continue | ||||
|                     try: | ||||
|                         transformed = t_func(item) | ||||
|                         if transformed: | ||||
|                             to_beaker.add_item(transformed, id) | ||||
|                     except Exception as e: | ||||
|                         for ( | ||||
|                             error_types, | ||||
|                             error_beaker_name, | ||||
|                         ) in transform.error_map.items(): | ||||
|                             if isinstance(e, error_types): | ||||
|                                 error_beaker = self.beakers[error_beaker_name] | ||||
|                                 error_beaker.add_item( | ||||
|                                     ErrorType( | ||||
|                                         item=item, | ||||
|                                         exception=str(e), | ||||
|                                         exc_type=str(type(e)), | ||||
|                                     ), | ||||
|                                     id, | ||||
|                                 ) | ||||
|                                 break | ||||
|                         else: | ||||
|                             # no error handler, re-raise | ||||
|                             raise | ||||
|  | @ -1,23 +0,0 @@ | |||
| import uuid | ||||
| from pydantic import BaseModel | ||||
| 
 | ||||
| 
 | ||||
| class Record: | ||||
|     _reserved_names = ("id",) | ||||
| 
 | ||||
|     def __init__(self, id: str | None = None): | ||||
|         self._id = id if id else str(uuid.uuid1()) | ||||
|         self._data: dict[str, BaseModel] = {} | ||||
| 
 | ||||
|     def __getattr__(self, name: str) -> str | BaseModel: | ||||
|         if name == "id": | ||||
|             return self._id | ||||
|         return self._data[name] | ||||
| 
 | ||||
|     def __setattr__(self, name: str, value: BaseModel) -> None: | ||||
|         if name.startswith("_"): | ||||
|             super().__setattr__(name, value) | ||||
|         elif name not in self._data and name not in self._reserved_names: | ||||
|             self._data[name] = value | ||||
|         else: | ||||
|             raise AttributeError(f"DataObject attribute {name} already exists") | ||||
|  | @ -1,34 +0,0 @@ | |||
| from beakers.record import Record | ||||
| import pytest | ||||
| 
 | ||||
| 
 | ||||
| def test_record_id_autogen(): | ||||
|     r = Record() | ||||
|     assert len(r.id) == 36 | ||||
|     r2, r3 = Record(), Record() | ||||
|     assert r2.id != r3.id | ||||
| 
 | ||||
| 
 | ||||
| def test_record_id_assign(): | ||||
|     r = Record(id="test") | ||||
|     assert r.id == "test" | ||||
| 
 | ||||
| 
 | ||||
| def test_record_setattr_good(): | ||||
|     r = Record() | ||||
|     r.attrib = "set" | ||||
|     assert r.attrib == "set" | ||||
| 
 | ||||
| 
 | ||||
| def test_record_setattr_duplicate(): | ||||
|     r = Record() | ||||
|     r.attrib = "set" | ||||
|     with pytest.raises(AttributeError): | ||||
|         r.attrib = "changed" | ||||
|     assert r.attrib == "set" | ||||
| 
 | ||||
| 
 | ||||
| def test_record_setattr_id(): | ||||
|     r = Record() | ||||
|     with pytest.raises(AttributeError): | ||||
|         r.id = "changed" | ||||
		Loading…
	
		Reference in a new issue
	
	 James Turk
						James Turk