2023-07-11 05:03:24 +00:00
|
|
|
import datetime
|
|
|
|
from pydantic import BaseModel
|
|
|
|
import lxml
|
|
|
|
from beakers import Recipe
|
|
|
|
from beakers.http import HttpRequest
|
|
|
|
|
|
|
|
|
|
|
|
class ArticleURL(BaseModel):
    """A URL to fetch, tagged with the outlet it belongs to."""

    # Address of the page to retrieve.
    url: str
    # Short outlet label; the seeds in this file use "npr" and "nytimes".
    source: str
|
|
|
|
|
|
|
|
|
|
|
|
class HttpResponse(BaseModel):
    """Record type registered for the "response" beaker."""

    # Address the response belongs to.
    url: str
    # presumably the HTTP status code -- TODO confirm against HttpRequest
    status: int
    # Response body; extract_npr_article parses this as HTML.
    content: str
    # presumably the time the response was fetched -- TODO confirm
    retrieved_at: datetime.datetime
|
|
|
|
|
|
|
|
|
|
|
|
class Article(BaseModel):
    """Extracted article content, stored in the "article" beaker."""

    # Headline text.
    title: str
    # Body text of the story.
    text: str
    # Image addresses found in the story (the NPR extractor leaves this empty).
    image_urls: list[str]
|
|
|
|
|
|
|
|
|
|
|
|
def is_npr(item) -> bool:
    """Return True when ``item.url.source`` is exactly ``"npr"``.

    ``item`` is expected to expose the seeded URL record as ``item.url``.
    """
    origin = item.url.source
    return origin == "npr"
|
|
|
|
|
|
|
|
|
|
|
|
def extract_npr_article(item) -> Article:
    """Build an ``Article`` from the HTML held in ``item.response.content``.

    The title is the text of the first ``.story-title`` element. The body is
    the text of every ``.paragraphs-container`` element, joined by newlines
    (an empty string when none match). ``image_urls`` is always empty.

    Raises:
        IndexError: if the document contains no ``.story-title`` element.
    """
    # Import the submodule explicitly: a bare ``import lxml`` does not make
    # ``lxml.html`` available as an attribute.
    from lxml.html import fromstring

    doc = fromstring(item.response.content)

    # ``.text`` on an lxml element is an attribute (and only covers text
    # before the first child), so calling it fails; text_content() gathers
    # all nested text.
    title = doc.cssselect(".story-title")[0].text_content().strip()

    # cssselect() returns a list of elements, so the text has to be taken
    # from each match rather than from the list itself.
    text = "\n".join(
        container.text_content().strip()
        for container in doc.cssselect(".paragraphs-container")
    )

    return Article(title=title, text=text, image_urls=[])
|
|
|
|
|
|
|
|
|
|
|
|
# Pipeline wiring: declare the beakers, then connect them with transforms.
recipe = Recipe("newsface", "newsface.db")

# One beaker per record type defined in this file.
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)

# Every seeded URL goes through HttpRequest to produce a response.
recipe.add_transform("url", "response", HttpRequest)

# Responses for which is_npr() holds are routed on to "npr_article".
# NOTE(review): "npr_article" is not registered with add_beaker in the
# visible code -- confirm whether beakers creates it implicitly.
recipe.add_conditional("response", is_npr, "npr_article")
recipe.add_transform("npr_article", "article", extract_npr_article)
|
2023-07-11 23:49:23 +00:00
|
|
|
# NOTE(review): the other add_transform calls in this file pass
# (from_beaker, to_beaker, func); this one passes a single argument, and
# "archived_article" is not registered with add_beaker in the visible
# code. Looks unfinished -- confirm the intended source, target, and
# transform before running.
recipe.add_transform("archived_article")
|
2023-07-11 05:03:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Sample NPR text-only story pages used as seeds.
npr_examples = [
    ArticleURL(url=story_url, source="npr")
    for story_url in (
        "https://text.npr.org/1186770075",
        "https://text.npr.org/1186525577",
        "https://text.npr.org/1185780577",
    )
]

# A non-NPR seed; is_npr() is False for it, so it is not sent to the NPR
# extractor.
other = [ArticleURL(url="https://nytimes.com", source="nytimes")]
|
|
|
|
|
2023-07-11 17:41:19 +00:00
|
|
|
# Seed the "url" beaker with the NPR samples followed by the non-NPR one.
recipe.add_seed("url", npr_examples + other)
|