convert to beakers 0.1

commit c510a4d5cc (parent b357a6d1d5)
@@ -1,71 +0,0 @@
import datetime

from pydantic import BaseModel
import lxml.html

from beakers import Recipe
from beakers.http import HttpRequest


class ArticleURL(BaseModel):
    url: str
    source: str


class HttpResponse(BaseModel):
    url: str
    status: int
    content: str
    retrieved_at: datetime.datetime


class Article(BaseModel):
    title: str
    text: str
    image_urls: list[str]


def is_npr(item) -> bool:
    return item.url.source == "npr"


def extract_npr_article(item) -> Article:
    doc = lxml.html.fromstring(item.response.content)
    # lxml elements expose their text via text_content(), not a text() call
    title = doc.cssselect(".story-title")[0].text_content()
    text = doc.cssselect(".paragraphs-container")[0].text_content()
    return Article(
        title=title,
        text=text,
        image_urls=[],
    )


recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)
recipe.add_transform("url", "response", HttpRequest)
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)
recipe.add_transform("archived_article")


npr_examples = [
    ArticleURL(url="https://text.npr.org/1186770075", source="npr"),
    ArticleURL(url="https://text.npr.org/1186525577", source="npr"),
    ArticleURL(url="https://text.npr.org/1185780577", source="npr"),
]
other = [
    ArticleURL(url="https://nytimes.com", source="nytimes"),
]

recipe.add_seed(
    "url",
    npr_examples + other,
)
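For reference, the two transform functions above can be exercised outside of beakers. This is an illustrative sketch only (not part of the commit), assuming a SimpleNamespace stand-in for whatever item wrapper the recipe actually passes, and reusing the ArticleURL, is_npr, and extract_npr_article definitions above with a dummy URL and dummy HTML:

from types import SimpleNamespace

fake_html = """
<html><body>
  <h1 class="story-title">Example headline</h1>
  <div class="paragraphs-container"><p>Example body text.</p></div>
</body></html>
"""

# hypothetical stand-in item; the real wrapper type comes from the library
item = SimpleNamespace(
    url=ArticleURL(url="https://text.npr.org/0", source="npr"),  # dummy URL
    response=SimpleNamespace(content=fake_html),
)

if is_npr(item):
    article = extract_npr_article(item)
    print(article.title)  # Example headline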
@@ -1,83 +0,0 @@
from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML


async def add_response(obj_with_url):
    url = obj_with_url["url"]
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


def tiktoken_count(response):
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    response["tiktoken_count"] = tokens

    return response


# current thinking, beakers exist within a recipe
recipe = Recipe("fetch urls", "url_example.db")
recipe.add_beaker("agencies")
recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)

recipe.add_conditional(
    "agencies",
    lambda x: x["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={
        (
            httpx.HTTPError,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
recipe.add_conditional(
    "with_tiktoken_count",
    lambda x: x["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)
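The token counting above boils down to encoding the cleaned HTML with tiktoken's cl100k_base encoding. A quick standalone check (illustrative only):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
# number of cl100k_base tokens in an arbitrary string
print(len(enc.encode("<p>hello world</p>")))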
foiaghost.py (82 lines deleted)
@@ -1,82 +0,0 @@
schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
}
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""

# create a scraper w/ a sqlite cache
scraper = Scraper(requests_per_minute=600)
scraper.cache_storage = SQLiteCache("cache.sqlite")

# create a scrapeghost
ghost = SchemaScraper(
    schema=schema,
    extra_preprocessors=[],
)


agencies = []


async def fetch_urls(urls):
    async with httpx.AsyncClient() as client:
        tasks = [client.get(url) for url in urls]
        responses = await asyncio.gather(*tasks)
        return responses


async def worker(queue, batch_size):
    with open("results.csv", "w") as outf:
        out = csv.DictWriter(
            outf, fieldnames=["id", "url", "status"] + list(schema.keys())
        )
        while True:
            urls = []
            for _ in range(batch_size):
                try:
                    url = await queue.get()
                    urls.append(url)
                except asyncio.QueueEmpty:
                    break
            if len(urls) > 0:
                responses = await fetch_urls(urls, batch_size)
                async yield responses


async def main():
    batch_size = 5

    with open("agencies.csv", "r") as inf,
        agencies = csv.DictReader(inf)
        # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
        except Exception as e:
            print(e)
            out.writerow(
                {
                    "id": agency["id"],
                    "url": agency["url"],
                    "status": "ERROR",
                }
            )
            continue
        result = ghost.scrape(page.text)
        out.writerow(
            result
            + {"id": agency["id"], "url": agency["url"], "status": "OK"}
        )


if __name__ == "__main__":
    main()
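Note that worker() and main() above were left unfinished (the bare "async yield", the dangling "with ... ," and the orphaned "except" are not valid Python), which is presumably why this file is deleted here and archived as old.py. The batching idea it was reaching for can be sketched on its own; this is an illustrative sketch only, using plain httpx and asyncio with no CSV or scrapeghost handling:

import asyncio

import httpx


async def fetch_batch(urls):
    # issue every request in the batch concurrently and wait for all responses
    async with httpx.AsyncClient() as client:
        return await asyncio.gather(*(client.get(url) for url in urls))


async def demo():
    responses = await fetch_batch(["https://example.com"] * 3)
    for response in responses:
        print(response.status_code)


if __name__ == "__main__":
    asyncio.run(demo())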
old.py (new file, 82 lines)
@@ -0,0 +1,82 @@
# schema = {
#     "public_records_email": "email",
#     "public_records_address": "str",
#     "public_records_phone": "555-555-5555",
#     "public_records_fax": "555-555-5555",
#     "public_records_web": "url",
#     "general_contact_phone": "555-555-5555",
#     "general_contact_address": "str",
#     "foia_guide": "url",
#     "public_reading_room": "url",
#     "agency_logo": "url",
# }
# extra_instructions = """
# The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
# The fields that begin with general_contact should refer to contact information for the agency in general.
# If a field is not found in the HTML, leave it as null in the JSON.
# """

# # create a scraper w/ a sqlite cache
# scraper = Scraper(requests_per_minute=600)
# scraper.cache_storage = SQLiteCache("cache.sqlite")

# # create a scrapeghost
# ghost = SchemaScraper(
#     schema=schema,
#     extra_preprocessors=[],
# )


# agencies = []


# async def fetch_urls(urls):
#     async with httpx.AsyncClient() as client:
#         tasks = [client.get(url) for url in urls]
#         responses = await asyncio.gather(*tasks)
#         return responses


# async def worker(queue, batch_size):
#     with open("results.csv", "w") as outf:
#         out = csv.DictWriter(
#             outf, fieldnames=["id", "url", "status"] + list(schema.keys())
#         )
#         while True:
#             urls = []
#             for _ in range(batch_size):
#                 try:
#                     url = await queue.get()
#                     urls.append(url)
#                 except asyncio.QueueEmpty:
#                     break
#             if len(urls) > 0:
#                 responses = await fetch_urls(urls, batch_size)
#                 async yield responses


# async def main():
#     batch_size = 5

#     with open("agencies.csv", "r") as inf,
#         agencies = csv.DictReader(inf)
#         # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
#         except Exception as e:
#             print(e)
#             out.writerow(
#                 {
#                     "id": agency["id"],
#                     "url": agency["url"],
#                     "status": "ERROR",
#                 }
#             )
#             continue
#         result = ghost.scrape(page.text)
#         out.writerow(
#             result
#             + {"id": agency["id"], "url": agency["url"], "status": "OK"}
#         )


# if __name__ == "__main__":
#     main()
src/foiaghost/__init__.py (new file, empty)

src/foiaghost/models.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from pydantic import BaseModel


class Agency(BaseModel):
    id: str
    url: str
    name: str


class URL(BaseModel):
    url: str


class Int(BaseModel):
    int: int
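These are ordinary pydantic models, so an Agency can be built straight from a CSV row dict, which is what the CSVSource seed in pipeline.py below relies on. An illustrative sketch only, with made-up row values:

row = {"id": "1", "url": "https://example.com", "name": "Example Agency"}
agency = Agency(**row)
print(agency.name)  # Example Agency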
src/foiaghost/pipeline.py (new file, 86 lines)
@@ -0,0 +1,86 @@
from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from databeakers import Pipeline
from databeakers.http import HttpRequest, HttpResponse
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
from .models import Agency, URL, Int
import csv


class CSVSource:
    def __init__(self, filename, datatype):
        self.filename = filename
        self.datatype = datatype

    def __call__(self):
        with open(self.filename) as inf:
            reader = csv.DictReader(inf)
            for line in reader:
                yield self.datatype(**line)


def tiktoken_count(response):
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    response["tiktoken_count"] = tokens

    return response


# current thinking, beakers exist within a recipe
recipe = Pipeline("foiaghost", "foiaghost.db")
recipe.add_beaker("agency", Agency)
recipe.add_beaker("good_urls", URL)
recipe.add_transform("agency", "good_urls", lambda x: x["url"].startswith("http"))
recipe.add_beaker("responses", HttpResponse)
recipe.add_transform("good_urls", "responses", HttpRequest)
recipe.add_beaker("tiktoken_count", Int)
recipe.add_transform(
    "responses",
    "tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
recipe.add_seed(
    "agencies",
    "agency",
    CSVSource("agencies.csv", Agency),
)

# recipe.add_beaker("token_lt_8k", temp=True)
# recipe.add_beaker("token_gt_8k", temp=True)


# recipe.add_transform(
#     "good_urls",
#     "responses",
#     add_response,
#     error_map={
#         (
#             httpx.HTTPError,
#             SSLCertVerificationError,
#             SSLError,
#         ): "bad_requests"
#     },
# )
# recipe.add_conditional(
#     "with_tiktoken_count",
#     lambda x: x["tiktoken_count"] < 8000,
#     if_true="token_lt_8k",
#     if_false="token_gt_8k",
# )