# foiaghost/examples/articles.py

import datetime

import lxml
import lxml.html
from beakers import Recipe
from beakers.http import HttpRequest
from pydantic import BaseModel


class ArticleURL(BaseModel):
    """An article URL to process, labelled with the outlet it belongs to."""

    # Address of the article page.
    url: str
    # Short outlet identifier; the seed data in this file uses "npr" and
    # "nytimes", and is_npr() compares this field against "npr".
    source: str


class HttpResponse(BaseModel):
    """Data stored in the "response" beaker for one fetched URL."""

    # NOTE(review): presumably these fields mirror what beakers' HttpRequest
    # transform emits — confirm against the beakers.http implementation.
    url: str
    # presumably the HTTP status code of the fetch — TODO confirm
    status: int
    # Response body; extract_npr_article() parses it as HTML.
    content: str
    retrieved_at: datetime.datetime


class Article(BaseModel):
    """Content extracted from an article page."""

    title: str
    text: str
    # The only extractor in this file, extract_npr_article(), always
    # supplies an empty list here.
    image_urls: list[str]


def is_npr(item) -> bool:
    """Report whether *item* is tagged as coming from NPR.

    Args:
        item: Object whose ``url`` attribute exposes a ``source`` value.

    Returns:
        The result of comparing that source against ``"npr"``.
    """
    origin = item.url.source
    return origin == "npr"


def extract_npr_article(item) -> Article:
    """Build an ``Article`` from the HTML of a fetched NPR page.

    Args:
        item: Object whose ``response.content`` attribute holds the page
            HTML as a string.

    Returns:
        An ``Article`` whose title and text are the whitespace-stripped text
        of the first ``.story-title`` and ``.paragraphs-container`` elements.
        ``image_urls`` is always empty; images are not extracted.

    Raises:
        IndexError: If the page has no ``.story-title`` or no
            ``.paragraphs-container`` element.
    """
    doc = lxml.html.fromstring(item.response.content)
    # cssselect() returns a list of matching elements, so take the first
    # match before reading any text from it.
    title_element = doc.cssselect(".story-title")[0]
    body_element = doc.cssselect(".paragraphs-container")[0]
    # text_content() gathers the text of the element and all descendants;
    # the plain ``.text`` attribute would stop at the first child tag.
    return Article(
        title=title_element.text_content().strip(),
        text=body_element.text_content().strip(),
        image_urls=[],
    )


# Pipeline wiring: "url" -> "response" -> (conditional) "npr_article" -> "article".
recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)
# HttpRequest (from beakers.http) is the transform between "url" and "response".
recipe.add_transform("url", "response", HttpRequest)
# Route items from "response" to "npr_article" based on is_npr.
# NOTE(review): "npr_article" is never declared with add_beaker in this
# file — confirm whether the recipe requires an explicit declaration.
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)
# TODO: add the "archived_article" step once its source beaker, destination
# beaker and transform function are defined. Every other add_transform call
# in this file supplies all three, so a one-argument call cannot stand in.


# Example NPR article URLs, each tagged with source "npr".
npr_examples = [
    ArticleURL(url=link, source="npr")
    for link in (
        "https://text.npr.org/1186770075",
        "https://text.npr.org/1186525577",
        "https://text.npr.org/1185780577",
    )
]
# One URL from a different source, so the seed data is not NPR-only.
other = [ArticleURL(url="https://nytimes.com", source="nytimes")]

# Seed the "url" beaker with every example: NPR entries first, then the rest.
recipe.add_seed("url", [*npr_examples, *other])