working on article example

This commit is contained in:
James Turk 2023-07-11 00:03:24 -05:00
parent 046c9e453b
commit 11003ef872
5 changed files with 124 additions and 10 deletions

71
examples/articles.py Normal file
View File

@ -0,0 +1,71 @@
import datetime
from pydantic import BaseModel
import lxml
from beakers import Recipe
from beakers.filters import ConditionalFilter
from beakers.http import HttpRequest
class ArticleURL(BaseModel):
    """Record type registered for the recipe's "url" beaker."""

    # Address of the article page.
    url: str
    # Short tag naming the outlet; the example data uses "npr" and "nytimes".
    source: str
class HttpResponse(BaseModel):
    """Record type registered for the recipe's "response" beaker.

    NOTE(review): beakers.http ships its own HttpResponse whose fields are
    named ``status_code`` / ``response_body`` rather than ``status`` /
    ``content`` -- confirm which shape the HttpRequest transform is expected
    to populate for this beaker.
    """

    url: str
    status: int
    # Page body; extract_npr_article parses this as HTML.
    content: str
    # Required here (no default is declared on this model).
    retrieved_at: datetime.datetime
class Article(BaseModel):
    """Record type registered for the recipe's "article" beaker."""

    title: str
    text: str
    # extract_npr_article currently always stores an empty list here.
    image_urls: list[str]
def is_npr(item) -> bool:
    """Report whether the record under ``item.url`` is tagged with source "npr"."""
    origin = item.url.source
    return origin == "npr"
def extract_npr_article(item) -> Article:
    """Build an Article from the HTML of a fetched NPR story page.

    Args:
        item: Pipeline record whose ``response.content`` holds the page HTML.

    Returns:
        An Article carrying the story title and body text; ``image_urls``
        is left empty.

    Raises:
        IndexError: if the page has no ``.story-title`` or
            ``.paragraphs-container`` element.
    """
    # A bare ``import lxml`` does not load the ``html`` submodule, so bring
    # it in explicitly where it is used.
    import lxml.html

    doc = lxml.html.fromstring(item.response.content)
    # cssselect() returns a list of matching elements, so index the first
    # match. ``.text`` on an element is a plain attribute (not callable);
    # text_content() gathers the text of the element and its descendants.
    title = doc.cssselect(".story-title")[0].text_content()
    text = doc.cssselect(".paragraphs-container")[0].text_content()
    return Article(
        title=title,
        text=text,
        image_urls=[],
    )
# Assemble the example pipeline: url -> response -> npr_article -> article.
recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)
# HttpRequest must be instantiated with the beaker and field that hold the
# URL; the request itself is made by the instance's __call__. Items expose
# the ArticleURL record as ``item.url`` (see is_npr), so the address lives at
# ``item.url.url``.
recipe.add_transform("url", "response", HttpRequest("url", "url"))
# Route responses accepted by is_npr into "npr_article"; no if_false beaker
# is supplied.
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)
# Sample NPR inputs (text-only story pages).
npr_examples = [
    ArticleURL(url=address, source="npr")
    for address in (
        "https://text.npr.org/1186770075",
        "https://text.npr.org/1186525577",
        "https://text.npr.org/1185780577",
    )
]
# One non-NPR input, for exercising the is_npr conditional.
other = [ArticleURL(url=address, source="nytimes") for address in ("https://nytimes.com",)]
# Seeding is currently disabled.
# NOTE(review): the beaker registered on the recipe is named "url", not
# "article_url" -- confirm the target name before re-enabling.
# recipe.add_seed(
#     "article_url",
#     npr_examples + other,
# )

View File

@ -0,0 +1 @@
from .recipe import Recipe

9
src/beakers/filters.py Normal file
View File

@ -0,0 +1,9 @@
class ConditionalFilter:
    """Callable that passes an item through only when a predicate accepts it."""

    def __init__(self, condition):
        # Predicate invoked with each item; its truthiness decides the outcome.
        self.condition = condition

    def __call__(self, item):
        """Return *item* when ``self.condition(item)`` is truthy, else ``None``."""
        return item if self.condition(item) else None

31
src/beakers/http.py Normal file
View File

@ -0,0 +1,31 @@
import httpx
import pydantic
import datetime
class HttpResponse(pydantic.BaseModel):
    """Outcome of a single HTTP GET, as produced by HttpRequest."""

    url: str
    status_code: int
    response_body: str
    # When no value is supplied, datetime.datetime.now is called to fill
    # this in.
    retrieved_at: datetime.datetime = pydantic.Field(default_factory=datetime.datetime.now)
class HttpRequest:
    """Async callable that fetches the URL found at ``item.<beaker>.<field>``."""

    def __init__(self, beaker: str, field: str):
        # Attribute names used to locate the URL on each incoming item.
        self.beaker, self.field = beaker, field

    async def __call__(self, item) -> HttpResponse:
        """GET the item's URL with a fresh httpx.AsyncClient and wrap the reply."""
        target = getattr(getattr(item, self.beaker), self.field)
        async with httpx.AsyncClient() as session:
            reply = await session.get(target)
            return HttpResponse(
                url=target,
                status_code=reply.status_code,
                response_body=reply.text,
            )

View File

@ -48,9 +48,9 @@ class Recipe:
def __repr__(self) -> str:
return f"Recipe({self.name})"
def add_beaker(self, name: str, temp: bool = False) -> Beaker:
self.graph.add_node(name)
if temp:
def add_beaker(self, name: str, datatype: type | None) -> Beaker:
self.graph.add_node(name, datatype=datatype)
if datatype is None:
self.beakers[name] = TempBeaker(name, self)
else:
self.beakers[name] = SqliteBeaker(name, self)
@ -85,14 +85,14 @@ class Recipe:
from_beaker: str,
condition_func: callable,
if_true: str,
if_false: str,
if_false: str = "",
) -> None:
# first add a transform to evaluate the conditional
if condition_func.__name__ == "<lambda>":
cond_name = f"cond-{from_beaker}"
else:
cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
self.add_beaker(cond_name, temp=True)
self.add_beaker(cond_name, None)
self.add_transform(
from_beaker,
cond_name,
@ -101,16 +101,18 @@ class Recipe:
)
# then add two filtered paths that remove the condition result
self.add_beaker(if_true, None)
self.add_transform(
cond_name,
if_true,
if_cond_true,
)
self.add_transform(
cond_name,
if_false,
if_cond_false,
)
if if_false:
self.add_transform(
cond_name,
if_false,
if_cond_false,
)
def get_metadata(self, table_name) -> dict:
cursor = self.db.cursor()