working on article example
This commit is contained in:
parent
046c9e453b
commit
11003ef872
71
examples/articles.py
Normal file
71
examples/articles.py
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
import datetime
|
||||||
|
from pydantic import BaseModel
|
||||||
|
import lxml
|
||||||
|
from beakers import Recipe
|
||||||
|
from beakers.filters import ConditionalFilter
|
||||||
|
from beakers.http import HttpRequest
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleURL(BaseModel):
    """Seed record identifying one article page to fetch."""

    # Address of the article page.
    url: str
    # Short publisher tag (the examples in this file use "npr" and "nytimes").
    source: str
|
||||||
|
|
||||||
|
|
||||||
|
class HttpResponse(BaseModel):
    """Schema declared for records kept in the "response" beaker."""

    # NOTE(review): beakers.http.HttpResponse names the equivalent fields
    # `status_code` and `response_body` -- confirm which schema is intended.
    url: str
    status: int
    content: str
    retrieved_at: datetime.datetime
|
||||||
|
|
||||||
|
|
||||||
|
class Article(BaseModel):
    """Structured article record produced by the extraction step."""

    # Headline text.
    title: str
    # Body text of the article.
    text: str
    # Image addresses associated with the article (may be empty).
    image_urls: list[str]
|
||||||
|
|
||||||
|
|
||||||
|
def is_npr(item) -> bool:
    """Return True when the item's URL record is tagged with the "npr" source.

    Args:
        item: Pipeline item exposing ``item.url.source``.

    Returns:
        ``True`` if ``item.url.source`` equals ``"npr"``, otherwise ``False``.
    """
    return item.url.source == "npr"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_npr_article(item) -> Article:
    """Parse a fetched NPR page into an ``Article``.

    Args:
        item: Pipeline item exposing the page HTML as
            ``item.response.content``.

    Returns:
        An ``Article`` holding the text of the first ``.story-title`` element
        as the title and the text of the first ``.paragraphs-container``
        element as the body. ``image_urls`` is always empty for now.

    Raises:
        IndexError: If either CSS selector matches no element.
    """
    # A bare `import lxml` does not load the `lxml.html` submodule, so it is
    # imported explicitly here.
    import lxml.html

    # NOTE(review): beakers.http.HttpResponse exposes the page text as
    # `response_body`, not `content` -- confirm which attribute the stored
    # response record actually carries.
    doc = lxml.html.fromstring(item.response.content)

    # cssselect() returns a list of elements; take the first match and use
    # text_content() (a method) to collect the text of the whole subtree.
    title = doc.cssselect(".story-title")[0].text_content()
    text = doc.cssselect(".paragraphs-container")[0].text_content()

    return Article(
        title=title,
        text=text,
        image_urls=[],
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Pipeline layout: url -> response -> (NPR items only) npr_article -> article.
recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)

# HttpRequest needs to know which beaker/field holds the address, so a
# configured instance is registered rather than the bare class; calling the
# class itself with an item would fail in __init__ (missing `field`).
# NOTE(review): assumes add_transform invokes the transform with the item.
recipe.add_transform("url", "response", HttpRequest("url", "url"))

# Route only responses whose source is NPR into the "npr_article" beaker.
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)
|
||||||
|
|
||||||
|
|
||||||
|
# Sample inputs for the pipeline: a few NPR text-only pages ...
npr_examples = [
    ArticleURL(url="https://text.npr.org/1186770075", source="npr"),
    ArticleURL(url="https://text.npr.org/1186525577", source="npr"),
    ArticleURL(url="https://text.npr.org/1185780577", source="npr"),
]
# ... and one non-NPR entry.
other = [
    ArticleURL(url="https://nytimes.com", source="nytimes"),
]

# Seeding is currently disabled.
# NOTE(review): "article_url" does not match any beaker name declared on the
# recipe in this file ("url", "response", "article") -- confirm before
# re-enabling.
# recipe.add_seed(
#     "article_url",
#     npr_examples + other,
# )
|
@ -0,0 +1 @@
|
|||||||
|
from .recipe import Recipe
|
9
src/beakers/filters.py
Normal file
9
src/beakers/filters.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
class ConditionalFilter:
    """Callable that passes an item through only when a predicate accepts it."""

    def __init__(self, condition):
        """Store the predicate.

        Args:
            condition: Callable taking one item; its result is evaluated
                for truthiness.
        """
        self.condition = condition

    def __call__(self, item):
        """Return ``item`` if the predicate is truthy for it, else ``None``."""
        return item if self.condition(item) else None
|
31
src/beakers/http.py
Normal file
31
src/beakers/http.py
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import httpx
|
||||||
|
import pydantic
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
|
class HttpResponse(pydantic.BaseModel):
    """Record describing the outcome of a single HTTP fetch."""

    # Address that was requested.
    url: str
    # Numeric HTTP status of the reply.
    status_code: int
    # Reply body as text.
    response_body: str
    # Defaults to datetime.datetime.now() at construction time when not
    # supplied; called without a tz argument, so the value is naive.
    retrieved_at: datetime.datetime = pydantic.Field(
        default_factory=datetime.datetime.now,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class HttpRequest:
    """Async transform that fetches the URL stored on a pipeline item."""

    def __init__(self, beaker: str, field: str):
        """Remember where on the item the address lives.

        Args:
            beaker: Attribute name on the item holding the source record.
            field: Attribute name on that record holding the URL.
        """
        self.beaker = beaker
        self.field = field

    async def __call__(self, item) -> HttpResponse:
        """GET ``item.<beaker>.<field>`` and wrap the reply in an HttpResponse."""
        target = getattr(getattr(item, self.beaker), self.field)

        # A fresh client is opened for each call and closed on exit.
        async with httpx.AsyncClient() as session:
            reply = await session.get(target)

        # retrieved_at is left to the model's default factory.
        return HttpResponse(
            url=target,
            status_code=reply.status_code,
            response_body=reply.text,
        )
|
@ -48,9 +48,9 @@ class Recipe:
|
|||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"Recipe({self.name})"
|
return f"Recipe({self.name})"
|
||||||
|
|
||||||
def add_beaker(self, name: str, temp: bool = False) -> Beaker:
|
def add_beaker(self, name: str, datatype: type | None) -> Beaker:
|
||||||
self.graph.add_node(name)
|
self.graph.add_node(name, datatype=datatype)
|
||||||
if temp:
|
if datatype is None:
|
||||||
self.beakers[name] = TempBeaker(name, self)
|
self.beakers[name] = TempBeaker(name, self)
|
||||||
else:
|
else:
|
||||||
self.beakers[name] = SqliteBeaker(name, self)
|
self.beakers[name] = SqliteBeaker(name, self)
|
||||||
@ -85,14 +85,14 @@ class Recipe:
|
|||||||
from_beaker: str,
|
from_beaker: str,
|
||||||
condition_func: callable,
|
condition_func: callable,
|
||||||
if_true: str,
|
if_true: str,
|
||||||
if_false: str,
|
if_false: str = "",
|
||||||
) -> None:
|
) -> None:
|
||||||
# first add a transform to evaluate the conditional
|
# first add a transform to evaluate the conditional
|
||||||
if condition_func.__name__ == "<lambda>":
|
if condition_func.__name__ == "<lambda>":
|
||||||
cond_name = f"cond-{from_beaker}"
|
cond_name = f"cond-{from_beaker}"
|
||||||
else:
|
else:
|
||||||
cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
|
cond_name = f"cond-{from_beaker}-{condition_func.__name__}"
|
||||||
self.add_beaker(cond_name, temp=True)
|
self.add_beaker(cond_name, None)
|
||||||
self.add_transform(
|
self.add_transform(
|
||||||
from_beaker,
|
from_beaker,
|
||||||
cond_name,
|
cond_name,
|
||||||
@ -101,16 +101,18 @@ class Recipe:
|
|||||||
)
|
)
|
||||||
|
|
||||||
# then add two filtered paths that remove the condition result
|
# then add two filtered paths that remove the condition result
|
||||||
|
self.add_beaker(if_true, None)
|
||||||
self.add_transform(
|
self.add_transform(
|
||||||
cond_name,
|
cond_name,
|
||||||
if_true,
|
if_true,
|
||||||
if_cond_true,
|
if_cond_true,
|
||||||
)
|
)
|
||||||
self.add_transform(
|
if if_false:
|
||||||
cond_name,
|
self.add_transform(
|
||||||
if_false,
|
cond_name,
|
||||||
if_cond_false,
|
if_false,
|
||||||
)
|
if_cond_false,
|
||||||
|
)
|
||||||
|
|
||||||
def get_metadata(self, table_name) -> dict:
|
def get_metadata(self, table_name) -> dict:
|
||||||
cursor = self.db.cursor()
|
cursor = self.db.cursor()
|
||||||
|
Loading…
Reference in New Issue
Block a user