networkx version

This commit is contained in:
James Turk 2023-05-07 19:34:12 -05:00
parent b688df1c02
commit f98b0140b6
2 changed files with 20 additions and 66 deletions

View File

@ -7,6 +7,8 @@ from dataclasses import dataclass
import networkx import networkx
from structlog import get_logger from structlog import get_logger
from .beakers import Beaker, SqliteBeaker, TempBeaker
log = get_logger() log = get_logger()
""" """
@ -41,54 +43,11 @@ pours are edges. The recipe is the graph.
""" """
# class Beaker:
# def __init__(self, table_name: str, recipe):
# self.table_name = table_name
# self.recipe = recipe
# # create table if it doesn't exist
# self.recipe.cursor.execute(
# f"CREATE TABLE IF NOT EXISTS {self.table_name} (id INTEGER PRIMARY KEY, data JSON, from_table TEXT NULL, from_id INTEGER NULL)"
# )
# def __repr__(self):
# return f"Beaker({self.table_name})"
# def items(self):
# self.recipe.cursor.execute(f"SELECT id, data FROM {self.table_name}")
# data = self.recipe.cursor.fetchall()
# for item in data:
# yield item["id"], json.loads(item["data"])
# def __len__(self):
# self.recipe.cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
# return self.recipe.cursor.fetchone()[0]
# def add_item(self, item: dict, from_table=None, from_id=None) -> None:
# self.recipe.cursor.execute(
# f"INSERT INTO {self.table_name} (data) VALUES (?)", (json.dumps(item),)
# )
# self.recipe.cursor.commit()
def get_sha512(filename: str) -> str: def get_sha512(filename: str) -> str:
with open(filename, "rb") as file: with open(filename, "rb") as file:
return hashlib.sha512(file.read()).hexdigest() return hashlib.sha512(file.read()).hexdigest()
@dataclass(eq=True, frozen=True)
class Beaker:
"""
A beaker is a node in the graph.
They can correspond to tables in the database,
or they can be temporary.
"""
name: str
temporary: bool = False
@dataclass(eq=True, frozen=True) @dataclass(eq=True, frozen=True)
class Pour: class Pour:
""" """
@ -130,17 +89,17 @@ class Recipe:
return f"Recipe({self.name})" return f"Recipe({self.name})"
def add_beaker(self, name: str, temp: bool = False) -> Beaker: def add_beaker(self, name: str, temp: bool = False) -> Beaker:
beaker = Beaker(name, temporary=temp) self.graph.add_node(name)
self.graph.add_node(beaker) if temp:
self.beakers[name] = beaker self.beakers[name] = TempBeaker(name, self)
return beaker else:
self.beakers[name] = SqliteBeaker(name, self)
return self.beakers[name]
def add_pour( def add_pour(
self, from_beaker: str, to_beaker: str, transform_func: callable self, from_beaker: str, to_beaker: str, transform_func: callable
) -> None: ) -> None:
self.graph.add_edge( self.graph.add_edge(from_beaker, to_beaker, transform=transform_func)
self.beakers[from_beaker], self.beakers[to_beaker], transform=transform_func
)
def add_conditional( def add_conditional(
self, self,
@ -151,7 +110,7 @@ class Recipe:
) -> None: ) -> None:
# first add a transform to evaluate the conditional # first add a transform to evaluate the conditional
cond_name = f"cond-{from_beaker}-{condition_func.__name__}" cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
cond = self.add_beaker(cond_name, temp=True) self.add_beaker(cond_name, temp=True)
self.add_pour( self.add_pour(
from_beaker, from_beaker,
cond_name, cond_name,
@ -160,13 +119,13 @@ class Recipe:
# then add two filtered paths that remove the condition result # then add two filtered paths that remove the condition result
self.graph.add_edge( self.graph.add_edge(
cond, cond_name,
self.beakers[if_true], if_true,
filter_func=lambda data, condition: data if condition else None, filter_func=lambda data, condition: data if condition else None,
) )
self.graph.add_edge( self.graph.add_edge(
cond, cond_name,
self.beakers[if_false], if_false,
filter_func=lambda data, condition: data if not condition else None, filter_func=lambda data, condition: data if not condition else None,
) )
@ -206,11 +165,11 @@ class Recipe:
beaker.add_item(row) beaker.add_item(row)
added += 1 added += 1
lg.info("from_csv", case="empty", added=added) lg.info("from_csv", case="empty", added=added)
meta = self.get_metadata(beaker.table_name) meta = self.get_metadata(beaker.name)
meta["sha512"] = get_sha512(filename) meta["sha512"] = get_sha512(filename)
self.save_metadata(beaker.table_name, meta) self.save_metadata(beaker.name, meta)
else: else:
old_sha = self.get_metadata(beaker).get("sha512") old_sha = self.get_metadata(beaker.name).get("sha512")
new_sha = get_sha512(filename) new_sha = get_sha512(filename)
if old_sha != new_sha: if old_sha != new_sha:
# case 3: mismatch # case 3: mismatch
@ -226,12 +185,7 @@ class Recipe:
Solve the DAG by topological sort. Solve the DAG by topological sort.
""" """
for node in networkx.topological_sort(self.graph): for node in networkx.topological_sort(self.graph):
if isinstance(node, Beaker):
print(node) print(node)
elif isinstance(node, Conditional):
print(node)
else:
raise Exception("unknown node type")
def run_linearly(self): def run_linearly(self):
log.info("recipe", recipe=self) log.info("recipe", recipe=self)
@ -247,4 +201,4 @@ class Recipe:
for id, item in pour.from_beaker.items(): for id, item in pour.from_beaker.items():
log.info("pour_item", id=id, item=item) log.info("pour_item", id=id, item=item)
transformed = loop.run_until_complete(pour.transform(item)) transformed = loop.run_until_complete(pour.transform(item))
pour.to_beaker.add_item(transformed, pour.from_beaker.table_name, id) pour.to_beaker.add_item(transformed, pour.from_beaker.name, id)

View File

@ -19,7 +19,6 @@ recipe.add_beaker("agencies")
recipe.add_beaker("responses") recipe.add_beaker("responses")
recipe.add_beaker("good_urls", temp=True) recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True) recipe.add_beaker("missing_urls", temp=True)
# recipe.csv_to_beaker("agencies.csv", "agencies")
recipe.add_conditional( recipe.add_conditional(
"agencies", "agencies",
lambda x: x["url"].startswith("http"), lambda x: x["url"].startswith("http"),
@ -28,4 +27,5 @@ recipe.add_conditional(
) )
recipe.add_pour("good_urls", "responses", add_response) recipe.add_pour("good_urls", "responses", add_response)
recipe.csv_to_beaker("agencies.csv", "agencies")
recipe.solve_dag() recipe.solve_dag()