foiaghost/examples/articles.py
2023-07-11 18:55:30 -05:00

72 lines
1.5 KiB
Python

import datetime

import lxml
import lxml.html
from beakers import Recipe
from beakers.http import HttpRequest
from pydantic import BaseModel
class ArticleURL(BaseModel):
    """A URL to fetch, tagged with the outlet it belongs to."""

    # Address of the article page.
    url: str
    # Short outlet label; this file uses "npr" and "nytimes".
    source: str
class HttpResponse(BaseModel):
    """Record of a fetched page, used as the model for the "response" beaker."""

    # Address that was requested.
    url: str
    # Numeric status of the response (presumably the HTTP status code).
    status: int
    # Response body as text; parsed as HTML by extract_npr_article.
    content: str
    # When the response was obtained.
    # NOTE(review): timezone handling is not visible here -- confirm.
    retrieved_at: datetime.datetime
class Article(BaseModel):
    """Structured article content extracted from a fetched page."""

    # Headline of the article.
    title: str
    # Body text of the article.
    text: str
    # URLs of images belonging to the article (may be empty).
    image_urls: list[str]
def is_npr(item) -> bool:
    """Return True when the item's URL record is labelled with source "npr".

    ``item`` must expose a ``url`` attribute that itself has a ``source``
    attribute (an ``ArticleURL`` satisfies this).
    """
    outlet = item.url.source
    return outlet == "npr"
def extract_npr_article(item) -> Article:
    """Build an ``Article`` from the HTML of a fetched NPR page.

    The HTML is read from ``item.response.content``. The title comes from
    the first element matching ``.story-title`` and the body text from the
    first element matching ``.paragraphs-container``.

    Args:
        item: Object exposing ``response.content`` holding the page HTML
            as a string.

    Returns:
        An ``Article`` with ``title`` and ``text`` filled in and an empty
        ``image_urls`` list (image extraction is not implemented here).

    Raises:
        IndexError: If either CSS selector matches no element.
    """
    # Relies on the file-level `import lxml.html`: a bare `import lxml` does
    # not load the `html` submodule.
    doc = lxml.html.fromstring(item.response.content)

    # cssselect() returns a list of elements, so take the first match.
    title_node = doc.cssselect(".story-title")[0]
    body_node = doc.cssselect(".paragraphs-container")[0]

    # text_content() is a method that gathers the text of the element and all
    # of its descendants; `.text` is a plain attribute (not callable) and
    # only holds the text before the first child element.
    return Article(
        title=title_node.text_content(),
        text=body_node.text_content(),
        image_urls=[],
    )
# Pipeline definition. The two arguments are presumably the recipe name and
# the path of its backing database -- confirm against beakers.Recipe.
recipe = Recipe("newsface", "newsface.db")

# Register each beaker together with the model it holds.
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)

# url -> response via HttpRequest.
recipe.add_transform("url", "response", HttpRequest)

# Items in "response" for which is_npr holds are directed to "npr_article".
# NOTE(review): "npr_article" is never registered with add_beaker in this
# file -- confirm whether add_conditional creates it implicitly.
recipe.add_conditional("response", is_npr, "npr_article")

# npr_article -> article via extract_npr_article.
recipe.add_transform("npr_article", "article", extract_npr_article)

# NOTE(review): this call passes a single argument, whereas the other
# add_transform calls pass three (two beaker names and a transform); it
# looks unfinished -- confirm the intended arguments.
recipe.add_transform("archived_article")
# Sample NPR text-site articles, all labelled with source "npr".
npr_examples = [
    ArticleURL(url=link, source="npr")
    for link in (
        "https://text.npr.org/1186770075",
        "https://text.npr.org/1186525577",
        "https://text.npr.org/1185780577",
    )
]

# A non-NPR entry; its source label differs from the "npr" value is_npr tests.
other = [ArticleURL(url="https://nytimes.com", source="nytimes")]

# Seed the "url" beaker with every example, NPR entries first.
recipe.add_seed("url", npr_examples + other)