"""Example recipe: fetch each agency's URL and count tokens in the cleaned HTML.

The module defines two transform functions (``add_response`` and
``tiktoken_count``) and wires them into the module-level ``recipe`` object.
"""
from ssl import SSLCertVerificationError, SSLError

import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError

from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML


async def add_response(obj_with_url):
    """Fetch ``obj_with_url["url"]`` and return the result as a plain dict.

    Args:
        obj_with_url: Mapping with a ``"url"`` key holding the address to GET.

    Returns:
        A dict with the keys ``"url"``, ``"status_code"`` and
        ``"response_body"`` (the response text).

    Raises:
        Any exception raised by the ``httpx`` request is propagated; the
        recipe below maps ``httpx.HTTPError`` and the ``ssl`` errors to the
        ``"bad_requests"`` beaker.
    """
    url = obj_with_url["url"]
    print(url)  # progress output: one line per requested URL

    # NOTE(review): httpx does not follow redirects unless asked to, so a
    # redirecting URL is stored with its 3xx status and later rejected by
    # tiktoken_count -- confirm this is intended.
    async with httpx.AsyncClient() as client:
        response = await client.get(url)

    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


def tiktoken_count(response):
    """Return a copy of ``response`` extended with a ``"tiktoken_count"`` key.

    The response body is parsed with ``lxml``, passed through scrapeghost's
    ``CleanHTML`` preprocessor, serialized back to text, and encoded with the
    ``cl100k_base`` tiktoken encoding; the number of tokens is the count.

    Args:
        response: Mapping with ``"status_code"`` and ``"response_body"`` keys,
            as produced by ``add_response``.

    Returns:
        A new dict holding every item of ``response`` plus
        ``"tiktoken_count"``. The input mapping is left unmodified.

    Raises:
        ValueError: If ``response["status_code"]`` is not 200.
        ParserError: Presumably raised by ``lxml`` for bodies it cannot parse
            (the recipe below routes it to ``"no_tiktoken_count"``).
    """
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    # clean the html
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding("cl100k_base")

    doc = lxml.html.fromstring(response["response_body"])
    (doc,) = cleaner(doc)  # returns a 1-item list
    cleaned_html = lxml.html.tostring(doc, encoding="unicode")

    # Build a new record rather than mutating the caller's dict, so the item
    # that was read from the source beaker is never changed behind its back.
    return {**response, "tiktoken_count": len(encoding.encode(cleaned_html))}


# current thinking, beakers exist within a recipe
recipe = Recipe("fetch urls", "url_example.db")
recipe.add_beaker("agencies")
recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)

# Split agencies on whether they carry something that looks like a URL.
recipe.add_conditional(
    "agencies",
    # A non-string url (e.g. None) counts as missing instead of raising
    # AttributeError on .startswith().
    lambda x: isinstance(x["url"], str) and x["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)

# Fetch every good URL; request/TLS failures are mapped to "bad_requests".
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={
        (
            httpx.HTTPError,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)

# Count tokens; non-200 responses and parse errors are mapped to
# "no_tiktoken_count".
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)

# Split counted responses at the 8000-token mark.
recipe.add_conditional(
    "with_tiktoken_count",
    lambda x: x["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)