foiaghost/examples/foiaghost.py

85 lines
2.2 KiB
Python
Raw Normal View History

2023-05-08 06:05:26 +00:00
from ssl import SSLCertVerificationError, SSLError
2023-05-07 23:39:46 +00:00
import httpx
2023-05-18 23:40:55 +00:00
import tiktoken
import lxml.html
from lxml.etree import ParserError
2023-05-08 01:06:28 +00:00
from beakers.beakers import Beaker
from beakers.recipe import Recipe
2023-05-18 23:40:55 +00:00
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
2023-04-27 06:25:07 +00:00
async def add_response(obj_with_url):
    """Fetch the URL stored on an item and summarize the HTTP response.

    Args:
        obj_with_url: Mapping with a ``"url"`` key holding the address to GET.

    Returns:
        A new dict with three keys: ``"url"`` (the URL that was requested),
        ``"status_code"`` (status of the final response) and
        ``"response_body"`` (the response text).

    Raises:
        KeyError: if ``obj_with_url`` has no ``"url"`` key.
        Exceptions raised by httpx while performing the request are not
        caught here and propagate to the caller.
    """
    url = obj_with_url["url"]
    print(url)  # progress output, one line per requested URL

    # httpx does not follow redirects unless asked to. Without this, a site
    # that answers with a 3xx (e.g. an http -> https upgrade) would be
    # recorded as a non-200 response with a stub body instead of the page.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        response = await client.get(url)

    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }
2023-05-18 23:40:55 +00:00
def tiktoken_count(response):
    """Attach the token count of the cleaned HTML body to a response dict.

    The body is parsed with ``lxml.html``, passed through ``CleanHTML``,
    serialized back to a string and encoded with tiktoken's
    ``cl100k_base`` encoding; the number of tokens is what gets stored.

    Args:
        response: dict with ``"status_code"`` and ``"response_body"`` keys.

    Returns:
        The same ``response`` object, updated in place with a
        ``"tiktoken_count"`` key.

    Raises:
        ValueError: if ``response["status_code"]`` is not 200.
    """
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    raw_html = response["response_body"]
    preprocess = CleanHTML()
    tokenizer = tiktoken.get_encoding("cl100k_base")

    parsed = lxml.html.fromstring(raw_html)
    # The preprocessor returns a 1-item list; unpacking insists on exactly one.
    [cleaned] = preprocess(parsed)
    cleaned_html = lxml.html.tostring(cleaned, encoding="unicode")

    response["tiktoken_count"] = len(tokenizer.encode(cleaned_html))
    return response
2023-05-07 23:39:46 +00:00
# Pipeline definition: all beakers are registered on a single recipe.
recipe = Recipe("fetch urls", "url_example.db")

# (beaker name, extra keyword arguments), listed in registration order.
# Only the beakers that were declared with temp=True receive that argument.
beaker_specs = [
    ("agencies", {}),
    ("responses", {}),
    ("bad_requests", {}),
    ("good_urls", {"temp": True}),
    ("missing_urls", {"temp": True}),
    ("with_tiktoken_count", {}),
    ("no_tiktoken_count", {"temp": True}),
    ("token_lt_8k", {"temp": True}),
    ("token_gt_8k", {"temp": True}),
]
for beaker_name, beaker_options in beaker_specs:
    recipe.add_beaker(beaker_name, **beaker_options)

# Step 1: split agencies on whether their "url" value starts with "http".
recipe.add_conditional(
    "agencies",
    lambda item: item["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)

# Step 2: fetch each good URL; the listed exception types are mapped to
# the "bad_requests" beaker.
fetch_errors = (httpx.HTTPError, SSLCertVerificationError, SSLError)
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={fetch_errors: "bad_requests"},
)

# Step 3: count tokens in each response; ValueError and ParserError are
# mapped to the "no_tiktoken_count" beaker.
count_errors = (ValueError, ParserError)
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={count_errors: "no_tiktoken_count"},
)

# Step 4: split the counted responses at 8000 tokens.
recipe.add_conditional(
    "with_tiktoken_count",
    lambda item: item["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)