foiaghost tiktoken counts

James Turk 2023-05-18 18:40:55 -05:00
parent 4c1392abff
commit 20f9809689
2 changed files with 42 additions and 9 deletions

View File

@@ -1,7 +1,12 @@
from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
async def add_response(obj_with_url):
@@ -16,6 +21,25 @@ async def add_response(obj_with_url):
}
def tiktoken_count(response):
if response["status_code"] != 200:
raise ValueError("response status code is not 200")
html = response["response_body"]
# clean the html
cleaner = CleanHTML()
encoding = tiktoken.get_encoding("cl100k_base")
doc = lxml.html.fromstring(html)
(doc,) = cleaner(doc) # returns a 1-item list
html_again = lxml.html.tostring(doc, encoding="unicode")
tokens = len(encoding.encode(html_again))
response["tiktoken_count"] = tokens
return response
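For reference, cl100k_base is the tiktoken encoding used by gpt-3.5-turbo and gpt-4, and the stored tiktoken_count is just the length of the encoded token list. A minimal sketch of that step in isolation (the HTML snippet is made up for illustration):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # same encoding as the transform above
tokens = enc.encode("<p>Public records requests: records@example.gov</p>")
print(len(tokens))  # the number stored as tiktoken_count
assert enc.decode(tokens) == "<p>Public records requests: records@example.gov</p>"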
# current thinking, beakers exist within a recipe
recipe = Recipe("fetch urls", "url_example.db")
recipe.add_beaker("agencies")
@ -23,6 +47,11 @@ recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)
recipe.add_conditional(
"agencies",
lambda x: x["url"].startswith("http"),
@@ -41,3 +70,15 @@ recipe.add_transform(
): "bad_requests"
},
)
recipe.add_transform(
"responses",
"with_tiktoken_count",
tiktoken_count,
error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
recipe.add_conditional(
"with_tiktoken_count",
lambda x: x["tiktoken_count"] < 8000,
if_true="token_lt_8k",
if_false="token_gt_8k",
)
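The 8,000-token split presumably tracks the 8k context window of the base GPT-4 model: pages under the threshold can go to the scraper in a single call, while larger ones need more preprocessing or a bigger-context model. A rough illustration of the routing semantics, independent of the beakers API (the records and URLs are made up; note a count of exactly 8000 lands in the gt bucket, since the predicate is a strict less-than):

# hypothetical records, not the beakers data model
responses = [
    {"url": "https://example.gov/foia", "tiktoken_count": 3200},
    {"url": "https://example.gov/huge-page", "tiktoken_count": 41000},
]
token_lt_8k = [r for r in responses if r["tiktoken_count"] < 8000]
token_gt_8k = [r for r in responses if not (r["tiktoken_count"] < 8000)]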

View File

@@ -1,11 +1,3 @@
import asyncio
import httpx
import csv
from asyncio import Queue
from itertools import zip_longest
from scrapelib import Scraper, SQLiteCache
from scrapeghost import SchemaScraper, CSS
schema = {
"public_records_email": "email",
"public_records_address": "str",
@@ -19,7 +11,7 @@ schema = {
"agency_logo": "url",
}
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests.
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""