from ssl import SSLCertVerificationError, SSLError

import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError

from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML


async def add_response(obj_with_url):
    """Fetch the item's URL and return the URL, status code, and response body."""
    print(obj_with_url["url"])
    url = obj_with_url["url"]
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


def tiktoken_count(response):
    """Count tokens in the cleaned response HTML; raises ValueError on non-200 responses."""
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    # cl100k_base is the encoding used by the gpt-3.5-turbo / gpt-4 chat models
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    response["tiktoken_count"] = tokens

    return response


# current thinking: beakers exist within a recipe
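# Overview of the wiring defined on `recipe` below:
#   agencies --(url starts with "http")--> good_urls, else missing_urls
#   good_urls --add_response--> responses; HTTP/SSL errors land in bad_requests
#   responses --tiktoken_count--> with_tiktoken_count; ValueError/ParserError land in no_tiktoken_count
#   with_tiktoken_count --(token count < 8000)--> token_lt_8k, else token_gt_8k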
recipe = Recipe("fetch urls", "url_example.db")

recipe.add_beaker("agencies")
recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)

recipe.add_conditional(
    "agencies",
    lambda x: x["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={
        (
            httpx.HTTPError,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
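# The 8,000-token cutoff below presumably mirrors an ~8k-token model context
# window; pages that fit go to token_lt_8k, everything else to token_gt_8k.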
recipe.add_conditional(
    "with_tiktoken_count",
    lambda x: x["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)
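

# A minimal manual check of the two transforms above, outside the recipe.
# This is only a sketch: it assumes network access, and the URL is an
# illustrative placeholder rather than a real item from the "agencies" beaker.
if __name__ == "__main__":
    import asyncio

    # fetch one page, then attach its cleaned-HTML token count
    fetched = asyncio.run(add_response({"url": "https://example.com"}))
    counted = tiktoken_count(fetched)
    print(counted["status_code"], counted["tiktoken_count"])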