foiaghost/examples/foiaghost.py
2023-05-18 18:40:55 -05:00

85 lines
2.2 KiB
Python

from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
async def add_response(obj_with_url):
    """Fetch the record's URL and summarize the HTTP response.

    Args:
        obj_with_url: Mapping with a ``"url"`` key holding the address to GET.

    Returns:
        A new dict with three keys: ``"url"`` (the requested address),
        ``"status_code"`` and ``"response_body"`` (the response text).

    Exceptions raised while making the request are not caught here; they
    propagate to the caller.
    """
    target = obj_with_url["url"]
    # Echo each URL as it is requested.
    print(target)
    # NOTE(review): httpx does not follow redirects unless asked to, so a
    # 301/302 is returned as-is with its own status code -- confirm that is
    # the intended behavior for these URLs.
    async with httpx.AsyncClient() as http:
        reply = await http.get(target)
        return {
            "url": target,
            "status_code": reply.status_code,
            "response_body": reply.text,
        }
def tiktoken_count(response):
    """Count the tokens in a response's cleaned HTML body.

    The body is parsed with ``lxml.html``, passed through scrapeghost's
    ``CleanHTML`` preprocessor, serialized back to a string, and encoded with
    tiktoken's ``cl100k_base`` encoding; the number of tokens is recorded.

    Args:
        response: Mapping with ``"status_code"`` and ``"response_body"`` keys.

    Returns:
        A new dict holding every key of ``response`` plus
        ``"tiktoken_count"``, the token count of the cleaned HTML. The input
        mapping itself is left unmodified.

    Raises:
        ValueError: If ``response["status_code"]`` is not 200.

    Errors raised by lxml while parsing the body are not caught here.
    """
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    # Return a fresh record instead of adding the key to the caller's dict,
    # so the input record is not altered as a side effect of counting.
    return {**response, "tiktoken_count": tokens}
# current thinking, beakers exist within a recipe
recipe = Recipe("fetch urls", "url_example.db")

# Beakers used by the steps below.
# NOTE(review): temp=True presumably marks a beaker as not persisted --
# confirm against the beakers library.
recipe.add_beaker("agencies")
recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)

# Split agencies on whether their "url" value starts with "http".
recipe.add_conditional(
    "agencies",
    lambda x: x["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)

# Fetch each good URL; the listed exception types are mapped to
# "bad_requests".
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={
        (
            httpx.HTTPError,
            # InvalidURL derives directly from Exception, not HTTPError, so
            # it must be listed explicitly for malformed URLs to be mapped
            # to "bad_requests" as well.
            httpx.InvalidURL,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)

# Count tokens in each response; ValueError (raised by tiktoken_count for a
# non-200 status) and lxml's ParserError are mapped to "no_tiktoken_count".
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)

# Split counted responses at the 8000-token mark.
recipe.add_conditional(
    "with_tiktoken_count",
    lambda x: x["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)