From 11003ef87244778ebfe6d59872a5bda38f740a68 Mon Sep 17 00:00:00 2001
From: James Turk
Date: Tue, 11 Jul 2023 00:03:24 -0500
Subject: [PATCH] working on article example

---
Review notes (this three-dash section is not part of the commit message):

 * examples/articles.py: `import lxml` alone does not load the `lxml.html`
   submodule, so `lxml.html.fromstring` in `extract_npr_article` fails unless
   something else has already imported it; `import lxml.html` is the safe form.
 * examples/articles.py: `cssselect()` returns a list, so
   `doc.cssselect(".paragraphs-container").text()` raises AttributeError, and
   `[0].text()` calls the element's `.text` attribute (a str or None) as if it
   were a method. `.text` or `.text_content()` is presumably what was meant.
 * examples/articles.py: `HttpRequest` is passed to `add_transform` as a
   class, but `HttpRequest.__init__` requires `beaker` and `field`; an
   instance such as `HttpRequest("url", "url")` looks intended (confirm).
 * examples/articles.py declares its own `HttpResponse` (`status`, `content`)
   while `beakers.http.HttpResponse` uses `status_code` and `response_body`;
   `item.response.content` matches only the former.
 * examples/articles.py: `ConditionalFilter` is imported but unused.
 * src/beakers/recipe.py: `add_beaker` replaces `temp: bool = False` with a
   required `datatype`; confirm no caller still relies on the old signature.
 * The `From:` header in this copy carries no e-mail address.

 examples/articles.py    | 71 +++++++++++++++++++++++++++++++++++++++++
 src/beakers/__init__.py |  1 +
 src/beakers/filters.py  |  9 ++++++
 src/beakers/http.py     | 31 ++++++++++++++++++
 src/beakers/recipe.py   | 22 +++++++------
 5 files changed, 124 insertions(+), 10 deletions(-)
 create mode 100644 examples/articles.py
 create mode 100644 src/beakers/filters.py
 create mode 100644 src/beakers/http.py

diff --git a/examples/articles.py b/examples/articles.py
new file mode 100644
index 0000000..728b0f7
--- /dev/null
+++ b/examples/articles.py
@@ -0,0 +1,71 @@
+import datetime
+from pydantic import BaseModel
+import lxml
+from beakers import Recipe
+from beakers.filters import ConditionalFilter
+from beakers.http import HttpRequest
+
+
+class ArticleURL(BaseModel):
+    url: str
+    source: str
+
+
+class HttpResponse(BaseModel):
+    url: str
+    status: int
+    content: str
+    retrieved_at: datetime.datetime
+
+
+class Article(BaseModel):
+    title: str
+    text: str
+    image_urls: list[str]
+
+
+def is_npr(item) -> bool:
+    return item.url.source == "npr"
+
+
+def extract_npr_article(item) -> Article:
+    doc = lxml.html.fromstring(item.response.content)
+    title = doc.cssselect(".story-title")[0].text()
+    text = doc.cssselect(".paragraphs-container").text()
+    return Article(
+        title=title,
+        text=text,
+        image_urls=[],
+    )
+
+
+recipe = Recipe("newsface", "newsface.db")
+recipe.add_beaker("url", ArticleURL)
+recipe.add_beaker("response", HttpResponse)
+recipe.add_beaker("article", Article)
+recipe.add_transform("url", "response", HttpRequest)
+recipe.add_conditional(
+    "response",
+    is_npr,
+    "npr_article",
+)
+recipe.add_transform(
+    "npr_article",
+    "article",
+    extract_npr_article,
+)
+
+
+npr_examples = [
+    ArticleURL(url="https://text.npr.org/1186770075", source="npr"),
+    ArticleURL(url="https://text.npr.org/1186525577", source="npr"),
+    ArticleURL(url="https://text.npr.org/1185780577", source="npr"),
+]
+other = [
+    ArticleURL(url="https://nytimes.com", source="nytimes"),
+]
+
+# recipe.add_seed(
+#     "article_url",
+#     npr_examples + other,
+# )
diff --git a/src/beakers/__init__.py b/src/beakers/__init__.py
index e69de29..13495de 100644
--- a/src/beakers/__init__.py
+++ b/src/beakers/__init__.py
@@ -0,0 +1 @@
+from .recipe import Recipe
diff --git a/src/beakers/filters.py b/src/beakers/filters.py
new file mode 100644
index 0000000..6a259f2
--- /dev/null
+++ b/src/beakers/filters.py
@@ -0,0 +1,9 @@
+class ConditionalFilter:
+    def __init__(self, condition):
+        self.condition = condition
+
+    def __call__(self, item):
+        if self.condition(item):
+            return item
+        else:
+            return None
diff --git a/src/beakers/http.py b/src/beakers/http.py
new file mode 100644
index 0000000..420886b
--- /dev/null
+++ b/src/beakers/http.py
@@ -0,0 +1,31 @@
+import httpx
+import pydantic
+import datetime
+
+
+class HttpResponse(pydantic.BaseModel):
+    url: str
+    status_code: int
+    response_body: str
+    retrieved_at: datetime.datetime = pydantic.Field(
+        default_factory=datetime.datetime.now
+    )
+
+
+class HttpRequest:
+    def __init__(self, beaker: str, field: str):
+        self.beaker = beaker
+        self.field = field
+
+    async def __call__(self, item) -> HttpResponse:
+        bkr = getattr(item, self.beaker)
+        url = getattr(bkr, self.field)
+
+        async with httpx.AsyncClient() as client:
+            response = await client.get(url)
+
+        return HttpResponse(
+            url=url,
+            status_code=response.status_code,
+            response_body=response.text,
+        )
diff --git a/src/beakers/recipe.py b/src/beakers/recipe.py
index 87eb933..98cd2f0 100644
--- a/src/beakers/recipe.py
+++ b/src/beakers/recipe.py
@@ -48,9 +48,9 @@ class Recipe:
     def __repr__(self) -> str:
         return f"Recipe({self.name})"
 
-    def add_beaker(self, name: str, temp: bool = False) -> Beaker:
-        self.graph.add_node(name)
-        if temp:
+    def add_beaker(self, name: str, datatype: type | None) -> Beaker:
+        self.graph.add_node(name, datatype=datatype)
+        if datatype is None:
             self.beakers[name] = TempBeaker(name, self)
         else:
             self.beakers[name] = SqliteBeaker(name, self)
@@ -85,14 +85,14 @@ class Recipe:
         from_beaker: str,
         condition_func: callable,
         if_true: str,
-        if_false: str,
+        if_false: str = "",
     ) -> None:
         # first add a transform to evaluate the conditional
         if condition_func.__name__ == "<lambda>":
             cond_name = f"cond-{from_beaker}"
         else:
             cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
-        self.add_beaker(cond_name, temp=True)
+        self.add_beaker(cond_name, None)
         self.add_transform(
             from_beaker,
             cond_name,
@@ -101,16 +101,18 @@ class Recipe:
         )
 
         # then add two filtered paths that remove the condition result
+        self.add_beaker(if_true, None)
         self.add_transform(
             cond_name,
             if_true,
             if_cond_true,
         )
-        self.add_transform(
-            cond_name,
-            if_false,
-            if_cond_false,
-        )
+        if if_false:
+            self.add_transform(
+                cond_name,
+                if_false,
+                if_cond_false,
+            )
 
     def get_metadata(self, table_name) -> dict:
         cursor = self.db.cursor()