# foiaghost/examples/articles.py
#
# 72 lines
# 1.5 KiB
# Python
# Raw Normal View History
#
# 2023-07-11 05:03:24 +00:00
import datetime
from pydantic import BaseModel
import lxml
from beakers import Recipe
from beakers.http import HttpRequest
class ArticleURL(BaseModel):
    """One article location to feed into the pipeline."""

    # Address of the article page.
    url: str
    # Short label for the publisher, e.g. "npr" (``is_npr`` compares against it).
    source: str
class HttpResponse(BaseModel):
    """Record type registered for the "response" beaker."""

    # Address the response belongs to.
    url: str
    # Numeric status — presumably the HTTP status code; confirm against HttpRequest.
    status: int
    # Response body as text; ``extract_npr_article`` parses it as HTML.
    content: str
    # When the response was fetched.
    retrieved_at: datetime.datetime
class Article(BaseModel):
    """Structured article extracted from a fetched page."""

    # Headline of the article.
    title: str
    # Body text of the article.
    text: str
    # URLs of images belonging to the article (may be empty).
    image_urls: list[str]
def is_npr(item) -> bool:
    """Return True when the item's URL record is labelled as coming from NPR.

    ``item`` must expose ``item.url.source``.
    """
    source_label = item.url.source
    return source_label == "npr"
def extract_npr_article(item) -> Article:
    """Build an ``Article`` from a fetched NPR page.

    Args:
        item: Pipeline item whose ``response.content`` holds the page HTML.

    Returns:
        An ``Article`` carrying the text of the first ``.story-title`` and
        ``.paragraphs-container`` elements; ``image_urls`` is always empty.

    Raises:
        ValueError: If either selector matches no element in the page.
    """
    # A bare ``import lxml`` does not load the ``lxml.html`` submodule, so
    # import it explicitly before use.
    import lxml.html

    doc = lxml.html.fromstring(item.response.content)

    def first_text(selector: str) -> str:
        # ``cssselect`` returns a list of matches; take the first one and
        # read its text with ``text_content()`` (``.text`` is an attribute,
        # not a method, and omits text nested in child elements).
        matches = doc.cssselect(selector)
        if not matches:
            raise ValueError(f"no element matching {selector!r} in NPR page")
        return matches[0].text_content()

    return Article(
        title=first_text(".story-title"),
        text=first_text(".paragraphs-container"),
        image_urls=[],
    )
# Pipeline definition: a recipe named "newsface" backed by "newsface.db".
recipe = Recipe("newsface", "newsface.db")

# Register each beaker together with the model describing its records.
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)

# url -> response via the HttpRequest transform.
recipe.add_transform("url", "response", HttpRequest)

# Responses are tested with is_npr; presumably matching items are routed to
# "npr_article" (confirm against the beakers add_conditional contract).
recipe.add_conditional("response", is_npr, "npr_article")

# npr_article -> article via the NPR-specific extractor.
recipe.add_transform("npr_article", "article", extract_npr_article)
# 2023-07-11 23:49:23 +00:00
# NOTE(review): every other add_transform call in this file passes
# (source beaker, target beaker, transform); this one passes a single name.
# Looks unfinished — confirm the intended source/target beakers and transform.
recipe.add_transform("archived_article")
# 2023-07-11 05:03:24 +00:00
# Sample NPR articles (text-only site), all labelled with source "npr".
npr_examples = [
    ArticleURL(url=npr_url, source="npr")
    for npr_url in (
        "https://text.npr.org/1186770075",
        "https://text.npr.org/1186525577",
        "https://text.npr.org/1185780577",
    )
]

# A non-NPR URL, so the seed data also contains an item is_npr rejects.
other = [ArticleURL(url="https://nytimes.com", source="nytimes")]
# 2023-07-11 17:41:19 +00:00
# Seed the "url" beaker with every example URL, NPR first, then the rest.
recipe.add_seed("url", npr_examples + other)