"""Recipe definition for the "newsface" pipeline.

Declares the data models, the NPR filter and extractor, and wires them into
a ``beakers`` ``Recipe`` seeded with a handful of example URLs.
"""

import datetime

import lxml
import lxml.html  # ``import lxml`` alone does not load the ``html`` submodule
from beakers import Recipe
from beakers.http import HttpRequest
from pydantic import BaseModel


class ArticleURL(BaseModel):
    """Seed record: an article URL tagged with the outlet it belongs to."""

    url: str
    # Short outlet identifier; the seeds below use "npr" and "nytimes".
    source: str


class HttpResponse(BaseModel):
    """Model registered for the "response" beaker."""

    url: str
    status: int
    # extract_npr_article() parses this field as HTML.
    content: str
    retrieved_at: datetime.datetime


class Article(BaseModel):
    """Model registered for the "article" beaker: the extracted article."""

    title: str
    text: str
    image_urls: list[str]


def is_npr(item) -> bool:
    """Return True when the item's URL record is tagged with source "npr".

    ``item`` must expose ``item.url.source``.
    """
    return item.url.source == "npr"


def extract_npr_article(item) -> Article:
    """Build an ``Article`` from the HTML held in ``item.response.content``.

    The title is read from the first ``.story-title`` element and the body
    from the first ``.paragraphs-container`` element.

    Raises:
        IndexError: if either selector matches nothing in the document.
    """
    doc = lxml.html.fromstring(item.response.content)
    # cssselect() returns a list of matching elements, so index into it
    # before reading any text.
    title_el = doc.cssselect(".story-title")[0]
    body_el = doc.cssselect(".paragraphs-container")[0]
    return Article(
        # text_content() collects the text of the element and all of its
        # descendants; ``.text`` is a plain attribute (not callable) and
        # only holds the text before the first child element.
        title=title_el.text_content().strip(),
        text=body_el.text_content().strip(),
        # Image extraction is not implemented; always empty for now.
        image_urls=[],
    )


recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)
# NOTE(review): HttpRequest is passed as the class itself, not an instance --
# confirm this is what add_transform expects in the installed beakers version.
recipe.add_transform("url", "response", HttpRequest)
# NOTE(review): "npr_article" is never declared with add_beaker above --
# confirm add_conditional creates it, otherwise it needs to be registered.
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)

npr_examples = [
    ArticleURL(url="https://text.npr.org/1186770075", source="npr"),
    ArticleURL(url="https://text.npr.org/1186525577", source="npr"),
    ArticleURL(url="https://text.npr.org/1185780577", source="npr"),
]
# A seed whose source is not "npr".
other = [
    ArticleURL(url="https://nytimes.com", source="nytimes"),
]

recipe.add_seed(
    "url",
    npr_examples + other,
)