WIP graph recipe
This commit is contained in:
parent
7381eb9f30
commit
8e4358c1e9
@ -1,41 +0,0 @@
|
|||||||
import sqlite3
|
|
||||||
import csv
|
|
||||||
import json
|
|
||||||
import hashlib
|
|
||||||
|
|
||||||
from structlog import get_logger
|
|
||||||
|
|
||||||
log = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
class Beaker:
    """Thin wrapper around a single SQLite table holding JSON-encoded items.

    A beaker is not usable until ``connect_to_db`` has been called: that call
    supplies the connection, creates the cursor and ensures the table exists.
    """

    def __init__(self, table_name: str):
        # NOTE: table_name is interpolated into SQL text (identifiers cannot
        # be bound as parameters), so it must come from trusted code.
        self.table_name = table_name

    def __repr__(self):
        return f"Beaker({self.table_name})"

    def __iter__(self):
        """Yield ``(id, item)`` pairs, with ``item`` decoded from JSON."""
        # Select ``id`` as well as ``data``: the loop below reads both columns.
        self.cursor.execute(f"SELECT id, data FROM {self.table_name}")
        # Fetch everything up front so that other statements run on the same
        # cursor while the caller is iterating cannot disturb the result set.
        for row in self.cursor.fetchall():
            yield row["id"], json.loads(row["data"])

    def __len__(self):
        """Return the number of rows currently stored in the table."""
        self.cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
        return self.cursor.fetchone()[0]

    def connect_to_db(self, db) -> None:
        """Attach the beaker to ``db`` and create its table if needed.

        ``db`` is expected to be a ``sqlite3`` connection; rows are returned
        as ``sqlite3.Row`` so columns can be read by name.
        """
        self.db = db
        self.cursor = self.db.cursor()
        self.cursor.row_factory = sqlite3.Row
        # create table if it doesn't exist
        self.cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {self.table_name} (id INTEGER PRIMARY KEY, data JSON, from_table TEXT NULL, from_id INTEGER NULL)"
        )

    def add_item(self, item: dict, from_table=None, from_id=None) -> None:
        """Store ``item`` as JSON and commit.

        ``from_table`` / ``from_id`` optionally record which row of which
        table this item was derived from; both default to NULL.
        """
        self.cursor.execute(
            f"INSERT INTO {self.table_name} (data, from_table, from_id) VALUES (?, ?, ?)",
            (json.dumps(item), from_table, from_id),
        )
        self.db.commit()
|
|
19
src/beakers/filters.py
Normal file
19
src/beakers/filters.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from .recipe import Beaker
|
||||||
|
|
||||||
|
|
||||||
|
class ConditionalFilter:
    """Callable that routes each item into one of two beakers.

    ``condition`` is called with the item; a truthy result sends the item to
    ``true_path``, anything else sends it to ``false_path``. Both paths only
    need to provide ``add_item(item)``.
    """

    def __init__(self, condition: callable, true_path: "Beaker", false_path: "Beaker"):
        self.condition = condition
        self.true_path = true_path
        self.false_path = false_path

    def __repr__(self):
        return f"ConditionalFilter({self.condition}, {self.true_path}, {self.false_path})"

    def __call__(self, item):
        # Pick the destination first, then perform the single add.
        destination = self.true_path if self.condition(item) else self.false_path
        destination.add_item(item)
|
@ -2,17 +2,72 @@ import csv
|
|||||||
import json
|
import json
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import hashlib
|
import hashlib
|
||||||
from .beaker import Beaker
|
import asyncio
|
||||||
|
from dataclasses import dataclass
|
||||||
from structlog import get_logger
|
from structlog import get_logger
|
||||||
|
|
||||||
log = get_logger()
|
log = get_logger()
|
||||||
|
|
||||||
|
"""
|
||||||
|
Implementation thoughts
|
||||||
|
|
||||||
class Pour:
|
Originally beaker was the central object, but pretty quickly
|
||||||
def __init__(self, from_beaker: Beaker, to_beaker: Beaker, transform: callable):
|
most functionality was moved to recipe. Now beaker is just a
|
||||||
self.from_beaker = from_beaker
|
thin wrapper around a sqlite table.
|
||||||
self.to_beaker = to_beaker
|
|
||||||
self.transform = transform
|
Recipe is the central object. It contains a list of pours,
|
||||||
|
which are the instructions for how to transform data from one
|
||||||
|
beaker to another.
|
||||||
|
|
||||||
|
Recipe also contains a list of beakers, which are just sqlite
|
||||||
|
tables. Beakers are created by the recipe, and are passed to
|
||||||
|
pours as arguments.
|
||||||
|
|
||||||
|
Beakers right now know they are stored in SQLite, but I think
|
||||||
|
that will be abstracted away. Beakers should be able to be
|
||||||
|
stored in any database, or on disk.
|
||||||
|
|
||||||
|
Core functionality of a beaker:
|
||||||
|
- initialize (create table if it doesn't exist)
|
||||||
|
- add item with optional link to another item
|
||||||
|
- should this be links? (probably)
|
||||||
|
- iterate over items
|
||||||
|
- get item by id
|
||||||
|
- count items
|
||||||
|
|
||||||
|
Ultimately, this is a graph problem. Beakers are nodes, and
|
||||||
|
pours are edges. The recipe is the graph.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class Beaker:
    """Thin wrapper around one SQLite table of JSON-encoded items.

    The beaker does not own a connection: it uses ``recipe.cursor`` for
    statements and ``recipe.db`` for commits.
    """

    def __init__(self, table_name: str, recipe):
        # NOTE: table_name is interpolated into SQL text (identifiers cannot
        # be bound as parameters), so it must come from trusted code.
        self.table_name = table_name
        self.recipe = recipe

        # create table if it doesn't exist
        self.recipe.cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {self.table_name} (id INTEGER PRIMARY KEY, data JSON, from_table TEXT NULL, from_id INTEGER NULL)"
        )

    def __repr__(self):
        return f"Beaker({self.table_name})"

    def items(self):
        """Yield ``(id, item)`` pairs, with ``item`` decoded from JSON."""
        self.recipe.cursor.execute(f"SELECT id, data FROM {self.table_name}")
        # The cursor is shared with the recipe and every other beaker, so the
        # rows are fetched in full before yielding; a caller that inserts
        # while iterating would otherwise reset the pending result set.
        for row in self.recipe.cursor.fetchall():
            yield row["id"], json.loads(row["data"])

    def __len__(self):
        """Return the number of rows currently stored in the table."""
        self.recipe.cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
        return self.recipe.cursor.fetchone()[0]

    def add_item(self, item: dict, from_table=None, from_id=None) -> None:
        """Store ``item`` as JSON and commit.

        ``from_table`` / ``from_id`` optionally record which row of which
        table this item was derived from; both default to NULL.
        """
        self.recipe.cursor.execute(
            f"INSERT INTO {self.table_name} (data, from_table, from_id) VALUES (?, ?, ?)",
            (json.dumps(item), from_table, from_id),
        )
        # Commit on the connection: sqlite3 cursors have no commit() method.
        self.recipe.db.commit()
|
||||||
|
|
||||||
|
|
||||||
def get_sha512(filename: str) -> str:
|
def get_sha512(filename: str) -> str:
|
||||||
@ -20,10 +75,27 @@ def get_sha512(filename: str) -> str:
|
|||||||
return hashlib.sha512(file.read()).hexdigest()
|
return hashlib.sha512(file.read()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Pour:
    """One edge of the recipe graph: move items between two beakers."""

    # Name of the beaker the items are read from.
    from_beaker: str
    # Name of the beaker the transformed items are written to.
    to_beaker: str
    # Callable applied to each item on its way across.
    transform: callable
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Split:
    """A two-way branch: items from one beaker go to one of two others."""

    # Name of the beaker the items are read from.
    from_beaker: str
    # Predicate evaluated per item to choose the destination.
    condition: callable
    # Name of the destination beaker when the predicate holds.
    if_true: str
    # Name of the destination beaker when the predicate does not hold.
    if_false: str
|
||||||
|
|
||||||
|
|
||||||
class Recipe:
|
class Recipe:
|
||||||
def __init__(self, name: str):
|
def __init__(self, name: str):
|
||||||
self.name = name
|
self.name = name
|
||||||
|
self.beakers = {}
|
||||||
self.pours = []
|
self.pours = []
|
||||||
|
self.splits = []
|
||||||
self.db = sqlite3.connect("beakers.db")
|
self.db = sqlite3.connect("beakers.db")
|
||||||
self.cursor = self.db.cursor()
|
self.cursor = self.db.cursor()
|
||||||
self.cursor.row_factory = sqlite3.Row
|
self.cursor.row_factory = sqlite3.Row
|
||||||
@ -62,7 +134,6 @@ class Recipe:
|
|||||||
self.pours.append(pour)
|
self.pours.append(pour)
|
||||||
|
|
||||||
def csv_to_beaker(self, filename: str, beaker: Beaker) -> None:
|
def csv_to_beaker(self, filename: str, beaker: Beaker) -> None:
|
||||||
beaker.connect_to_db(self.db)
|
|
||||||
lg = log.bind(beaker=beaker, filename=filename)
|
lg = log.bind(beaker=beaker, filename=filename)
|
||||||
# three cases: empty, match, mismatch
|
# three cases: empty, match, mismatch
|
||||||
# case 1: empty
|
# case 1: empty
|
||||||
@ -78,7 +149,7 @@ class Recipe:
|
|||||||
meta["sha512"] = get_sha512(filename)
|
meta["sha512"] = get_sha512(filename)
|
||||||
self.save_metadata(beaker.table_name, meta)
|
self.save_metadata(beaker.table_name, meta)
|
||||||
else:
|
else:
|
||||||
old_sha = self.get_metadata(beaker.table_name).get("sha512")
|
old_sha = self.get_metadata(beaker).get("sha512")
|
||||||
new_sha = get_sha512(filename)
|
new_sha = get_sha512(filename)
|
||||||
if old_sha != new_sha:
|
if old_sha != new_sha:
|
||||||
# case 3: mismatch
|
# case 3: mismatch
|
||||||
@ -89,23 +160,36 @@ class Recipe:
|
|||||||
lg.info("from_csv", case="match")
|
lg.info("from_csv", case="match")
|
||||||
return beaker
|
return beaker
|
||||||
|
|
||||||
def add_pour(self, from_beaker: Beaker, to_beaker: Beaker, transform: callable):
|
def declare_beaker(self, name: str, temp=True):
    """Create the beaker called ``name`` and register it on this recipe.

    When ``temp`` is true, any existing table of that name is dropped first,
    so the beaker starts out empty. Returns the new ``Beaker``.
    """
    if temp:
        # Drop BEFORE constructing the Beaker. The constructor issues
        # CREATE TABLE IF NOT EXISTS, so dropping afterwards would leave the
        # beaker registered but without a backing table.
        self.cursor.execute(f"DROP TABLE IF EXISTS {name}")
    beaker = Beaker(name, self)
    self.beakers[name] = beaker
    return beaker
|
||||||
|
|
||||||
|
def add_pour(self, from_beaker: str, to_beaker: str, transform: callable):
|
||||||
pour = Pour(from_beaker, to_beaker, transform)
|
pour = Pour(from_beaker, to_beaker, transform)
|
||||||
self.pours.append(pour)
|
self.pours.append(pour)
|
||||||
|
|
||||||
|
def add_split(self, from_beaker: str, condition: callable, if_true: str, if_false: str):
    """Register a conditional branch on the recipe and return it.

    Items from ``from_beaker`` are meant to go to ``if_true`` when
    ``condition`` holds for them and to ``if_false`` otherwise; this method
    only records the ``Split``, it does not move any data.
    """
    new_split = Split(
        from_beaker=from_beaker,
        condition=condition,
        if_true=if_true,
        if_false=if_false,
    )
    self.splits.append(new_split)
    return new_split
|
||||||
|
|
||||||
def run_linearly(self):
|
def run_linearly(self):
|
||||||
log.info("recipe", recipe=self)
|
log.info("recipe", recipe=self)
|
||||||
for pour in self.pours:
|
loop = asyncio.get_event_loop()
|
||||||
pour.from_beaker.connect_to_db(self.db)
|
|
||||||
pour.to_beaker.connect_to_db(self.db)
|
|
||||||
|
|
||||||
|
for pour in self.pours:
|
||||||
log.info(
|
log.info(
|
||||||
"pour",
|
"pour",
|
||||||
from_beaker=pour.from_beaker,
|
from_beaker=pour.from_beaker,
|
||||||
to_beaker=pour.to_beaker,
|
to_beaker=pour.to_beaker,
|
||||||
to_pour=len(pour.from_beaker),
|
to_pour=len(pour.from_beaker),
|
||||||
)
|
)
|
||||||
for id, item in pour.from_beaker:
|
for id, item in pour.from_beaker.items():
|
||||||
log.info("pour_item", id=id, item=item)
|
log.info("pour_item", id=id, item=item)
|
||||||
transformed = pour.transform(item)
|
transformed = loop.run_until_complete(pour.transform(item))
|
||||||
pour.to_beaker.add_item(transformed, pour.from_beaker.table_name, id)
|
pour.to_beaker.add_item(transformed, pour.from_beaker.table_name, id)
|
||||||
|
@ -1,12 +1,9 @@
|
|||||||
import csv
|
import httpx
|
||||||
from beakers.recipe import Recipe
|
from beakers.recipe import Recipe, Beaker
|
||||||
from beakers.beaker import Beaker
|
|
||||||
|
|
||||||
urls = Beaker("urls")
|
|
||||||
responses = Beaker("responses")
|
|
||||||
|
|
||||||
|
|
||||||
async def add_response(obj_with_url):
|
async def add_response(obj_with_url):
|
||||||
|
print(obj_with_url["url"])
|
||||||
url = obj_with_url["url"]
|
url = obj_with_url["url"]
|
||||||
response = await httpx.get(url)
|
response = await httpx.get(url)
|
||||||
return {
|
return {
|
||||||
@ -16,10 +13,19 @@ async def add_response(obj_with_url):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
agencies = Beaker("agencies")
|
# current thinking, beakers exist within a recipe
|
||||||
responses = Beaker("responses")
|
|
||||||
recipe = Recipe("fetch urls")
|
recipe = Recipe("fetch urls")
|
||||||
recipe.csv_to_beaker("agencies.csv", agencies)
|
recipe.declare_beaker("agencies")
|
||||||
recipe.add_pour(agencies, responses, add_response)
|
recipe.declare_beaker("responses")
|
||||||
|
recipe.declare_beaker("good_urls", temp=True)
|
||||||
|
recipe.declare_beaker("missing_urls", temp=True)
|
||||||
|
recipe.csv_to_beaker("agencies.csv", "agencies")
|
||||||
|
recipe.add_split(
|
||||||
|
"agencies",
|
||||||
|
lambda x: x["url"].startswith("http"),
|
||||||
|
if_true="good_urls",
|
||||||
|
if_false="missing_urls",
|
||||||
|
)
|
||||||
|
recipe.add_pour("good_urls", "responses", add_response)
|
||||||
|
|
||||||
recipe.run_linearly()
|
recipe.run_linearly()
|
||||||
|
Loading…
Reference in New Issue
Block a user