working on article example
This commit is contained in:
parent
046c9e453b
commit
11003ef872
71
examples/articles.py
Normal file
71
examples/articles.py
Normal file
@ -0,0 +1,71 @@
|
||||
import datetime
|
||||
from pydantic import BaseModel
|
||||
import lxml
|
||||
from beakers import Recipe
|
||||
from beakers.filters import ConditionalFilter
|
||||
from beakers.http import HttpRequest
|
||||
|
||||
|
||||
class ArticleURL(BaseModel):
    """Seed record for the pipeline: one article address plus its source tag."""

    # Address of the article page.
    url: str
    # Short publisher label; `is_npr` compares this value against "npr".
    source: str
|
||||
|
||||
|
||||
class HttpResponse(BaseModel):
    """Schema registered for the "response" beaker in this example.

    NOTE(review): beakers.http.HttpResponse names its fields ``status_code``
    and ``response_body`` rather than ``status`` and ``content`` -- confirm
    which of the two models is meant to describe stored responses.
    """

    # Address that was requested.
    url: str
    # Numeric HTTP status of the reply.
    status: int
    # Body of the reply as text.
    content: str
    # Timestamp of the fetch.
    retrieved_at: datetime.datetime
|
||||
|
||||
|
||||
class Article(BaseModel):
    """Structured article produced by the extraction step."""

    # Headline text.
    title: str
    # Body text of the story.
    text: str
    # Image addresses found in the story (may be empty).
    image_urls: list[str]
|
||||
|
||||
|
||||
def is_npr(item) -> bool:
    """Return True when the item's ``url`` record carries the "npr" source tag."""
    source_tag = item.url.source
    return source_tag == "npr"
|
||||
|
||||
|
||||
def extract_npr_article(item) -> Article:
    """Build an Article from the HTML of a fetched NPR page.

    Args:
        item: Pipeline item whose ``response.content`` holds the page HTML.

    Returns:
        An Article with the stripped text of the first ``.story-title``
        element as its title, the stripped text of the first
        ``.paragraphs-container`` element as its body, and no image URLs.

    Raises:
        ValueError: If either expected element is missing from the page.
    """
    # A bare ``import lxml`` does not load the ``lxml.html`` submodule, so it
    # is imported explicitly here.
    import lxml.html

    # NOTE(review): beakers.http.HttpResponse exposes ``response_body`` rather
    # than ``content`` -- confirm which model the "response" beaker stores.
    doc = lxml.html.fromstring(item.response.content)

    # cssselect() returns a list of matching elements, so take the first match
    # explicitly instead of calling element methods on the list.
    title_nodes = doc.cssselect(".story-title")
    body_nodes = doc.cssselect(".paragraphs-container")
    if not title_nodes:
        raise ValueError("no .story-title element found in response")
    if not body_nodes:
        raise ValueError("no .paragraphs-container element found in response")

    # ``.text`` is an attribute (not a method) and only covers text before the
    # first child element; text_content() gathers text from all descendants.
    return Article(
        title=title_nodes[0].text_content().strip(),
        text=body_nodes[0].text_content().strip(),
        image_urls=[],
    )
|
||||
|
||||
|
||||
# Pipeline wiring: url -> response -> (NPR only) npr_article -> article.
recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)

# HttpRequest has to be instantiated: its constructor takes the beaker name
# and the field holding the address, and the per-item work happens in the
# instance's __call__. Here the address is the ``url`` field of the "url"
# beaker's record.
recipe.add_transform("url", "response", HttpRequest("url", "url"))

# Route only NPR items onward; no "if_false" destination is given, so
# non-NPR responses are not forwarded anywhere.
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)
|
||||
|
||||
|
||||
# Sample seed records: three NPR text-only stories and one non-NPR address.
npr_examples = [
    ArticleURL(url=story_url, source="npr")
    for story_url in (
        "https://text.npr.org/1186770075",
        "https://text.npr.org/1186525577",
        "https://text.npr.org/1185780577",
    )
]
other = [ArticleURL(url="https://nytimes.com", source="nytimes")]
|
||||
|
||||
# recipe.add_seed(
|
||||
# "article_url",
|
||||
# npr_examples + other,
|
||||
# )
|
@ -0,0 +1 @@
|
||||
from .recipe import Recipe
|
9
src/beakers/filters.py
Normal file
9
src/beakers/filters.py
Normal file
@ -0,0 +1,9 @@
|
||||
class ConditionalFilter:
    """Callable that passes an item through only when a predicate accepts it."""

    def __init__(self, condition):
        # Predicate invoked with each item; its truthiness decides the outcome.
        self.condition = condition

    def __call__(self, item):
        """Return ``item`` if the predicate is truthy for it, otherwise None."""
        return item if self.condition(item) else None
|
31
src/beakers/http.py
Normal file
31
src/beakers/http.py
Normal file
@ -0,0 +1,31 @@
|
||||
import httpx
|
||||
import pydantic
|
||||
import datetime
|
||||
|
||||
|
||||
class HttpResponse(pydantic.BaseModel):
    """Result of one HTTP fetch, as returned by HttpRequest."""

    # Address that was requested.
    url: str
    # Numeric HTTP status of the reply.
    status_code: int
    # Reply body as text.
    response_body: str
    # Defaults to datetime.datetime.now() (no timezone argument) evaluated
    # when the model instance is created.
    retrieved_at: datetime.datetime = pydantic.Field(default_factory=datetime.datetime.now)
|
||||
|
||||
|
||||
class HttpRequest:
    """Async callable that GETs an address read from a pipeline item.

    The address is looked up as ``item.<beaker>.<field>`` using the two names
    supplied at construction time.
    """

    def __init__(self, beaker: str, field: str):
        self.beaker, self.field = beaker, field

    async def __call__(self, item) -> HttpResponse:
        """Fetch the item's address and wrap the reply in an HttpResponse."""
        target = getattr(getattr(item, self.beaker), self.field)

        # A fresh client is opened for every call and closed once the reply
        # has been received.
        async with httpx.AsyncClient() as session:
            reply = await session.get(target)

        return HttpResponse(
            url=target,
            status_code=reply.status_code,
            response_body=reply.text,
        )
|
@ -48,9 +48,9 @@ class Recipe:
|
||||
def __repr__(self) -> str:
|
||||
return f"Recipe({self.name})"
|
||||
|
||||
def add_beaker(self, name: str, temp: bool = False) -> Beaker:
|
||||
self.graph.add_node(name)
|
||||
if temp:
|
||||
def add_beaker(self, name: str, datatype: type | None) -> Beaker:
|
||||
self.graph.add_node(name, datatype=datatype)
|
||||
if datatype is None:
|
||||
self.beakers[name] = TempBeaker(name, self)
|
||||
else:
|
||||
self.beakers[name] = SqliteBeaker(name, self)
|
||||
@ -85,14 +85,14 @@ class Recipe:
|
||||
from_beaker: str,
|
||||
condition_func: callable,
|
||||
if_true: str,
|
||||
if_false: str,
|
||||
if_false: str = "",
|
||||
) -> None:
|
||||
# first add a transform to evaluate the conditional
|
||||
if condition_func.__name__ == "<lambda>":
|
||||
cond_name = f"cond-{from_beaker}"
|
||||
else:
|
||||
cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
|
||||
self.add_beaker(cond_name, temp=True)
|
||||
self.add_beaker(cond_name, None)
|
||||
self.add_transform(
|
||||
from_beaker,
|
||||
cond_name,
|
||||
@ -101,16 +101,18 @@ class Recipe:
|
||||
)
|
||||
|
||||
# then add two filtered paths that remove the condition result
|
||||
self.add_beaker(if_true, None)
|
||||
self.add_transform(
|
||||
cond_name,
|
||||
if_true,
|
||||
if_cond_true,
|
||||
)
|
||||
self.add_transform(
|
||||
cond_name,
|
||||
if_false,
|
||||
if_cond_false,
|
||||
)
|
||||
if if_false:
|
||||
self.add_transform(
|
||||
cond_name,
|
||||
if_false,
|
||||
if_cond_false,
|
||||
)
|
||||
|
||||
def get_metadata(self, table_name) -> dict:
|
||||
cursor = self.db.cursor()
|
||||
|
Loading…
Reference in New Issue
Block a user