foiaghost tiktoken counts

James Turk 2023-05-18 18:40:55 -05:00
parent 4c1392abff
commit 20f9809689
2 changed files with 42 additions and 9 deletions

View File

@@ -1,7 +1,12 @@
from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
async def add_response(obj_with_url):
@@ -16,6 +21,25 @@ async def add_response(obj_with_url):
}
def tiktoken_count(response):
if response["status_code"] != 200:
raise ValueError("response status code is not 200")
html = response["response_body"]
# clean the html
cleaner = CleanHTML()
encoding = tiktoken.get_encoding("cl100k_base")
doc = lxml.html.fromstring(html)
(doc,) = cleaner(doc) # returns a 1-item list
html_again = lxml.html.tostring(doc, encoding="unicode")
tokens = len(encoding.encode(html_again))
response["tiktoken_count"] = tokens
return response
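For reference, cl100k_base is the tiktoken encoding used by gpt-3.5-turbo and gpt-4, and the stored tiktoken_count is just the length of the encoded token list. A minimal sketch of that step in isolation (the HTML snippet is made up for illustration):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # same encoding as the transform above
tokens = enc.encode("<p>Public records requests: records@example.gov</p>")
print(len(tokens))  # the number stored as tiktoken_count
assert enc.decode(tokens) == "<p>Public records requests: records@example.gov</p>"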
# current thinking, beakers exist within a recipe
recipe = Recipe("fetch urls", "url_example.db")
recipe.add_beaker("agencies")
@ -23,6 +47,11 @@ recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)
recipe.add_conditional(
"agencies",
lambda x: x["url"].startswith("http"),
@@ -41,3 +70,15 @@ recipe.add_transform(
): "bad_requests"
},
)
recipe.add_transform(
"responses",
"with_tiktoken_count",
tiktoken_count,
error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
recipe.add_conditional(
"with_tiktoken_count",
lambda x: x["tiktoken_count"] < 8000,
if_true="token_lt_8k",
if_false="token_gt_8k",
)
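The 8,000-token split presumably tracks the 8k context window of the base GPT-4 model: pages under the threshold can go to the scraper in a single call, while larger ones need more preprocessing or a bigger-context model. A rough illustration of the routing semantics, independent of the beakers API (the records and URLs are made up; note a count of exactly 8000 lands in the gt bucket, since the predicate is a strict less-than):

# hypothetical records, not the beakers data model
responses = [
    {"url": "https://example.gov/foia", "tiktoken_count": 3200},
    {"url": "https://example.gov/huge-page", "tiktoken_count": 41000},
]
token_lt_8k = [r for r in responses if r["tiktoken_count"] < 8000]
token_gt_8k = [r for r in responses if not (r["tiktoken_count"] < 8000)]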

View File

@@ -1,11 +1,3 @@
import asyncio
import httpx
import csv
from asyncio import Queue
from itertools import zip_longest
from scrapelib import Scraper, SQLiteCache
from scrapeghost import SchemaScraper, CSS
schema = {
"public_records_email": "email",
"public_records_address": "str",
@@ -19,7 +11,7 @@ schema = {
"agency_logo": "url",
}
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests.
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""