diff --git a/examples/foiaghost.py b/examples/foiaghost.py index 1e4a496..ba4d823 100644 --- a/examples/foiaghost.py +++ b/examples/foiaghost.py @@ -1,7 +1,12 @@ from ssl import SSLCertVerificationError, SSLError import httpx +import tiktoken +import lxml.html +from lxml.etree import ParserError from beakers.beakers import Beaker from beakers.recipe import Recipe +from scrapeghost import SchemaScraper +from scrapeghost.preprocessors import CleanHTML async def add_response(obj_with_url): @@ -16,6 +21,25 @@ async def add_response(obj_with_url): } +def tiktoken_count(response): + if response["status_code"] != 200: + raise ValueError("response status code is not 200") + + html = response["response_body"] + + # clean the html + cleaner = CleanHTML() + encoding = tiktoken.get_encoding("cl100k_base") + doc = lxml.html.fromstring(html) + (doc,) = cleaner(doc) # returns a 1-item list + html_again = lxml.html.tostring(doc, encoding="unicode") + tokens = len(encoding.encode(html_again)) + + response["tiktoken_count"] = tokens + + return response + + # current thinking, beakers exist within a recipe recipe = Recipe("fetch urls", "url_example.db") recipe.add_beaker("agencies") @@ -23,6 +47,11 @@ recipe.add_beaker("responses") recipe.add_beaker("bad_requests") recipe.add_beaker("good_urls", temp=True) recipe.add_beaker("missing_urls", temp=True) +recipe.add_beaker("with_tiktoken_count") +recipe.add_beaker("no_tiktoken_count", temp=True) +recipe.add_beaker("token_lt_8k", temp=True) +recipe.add_beaker("token_gt_8k", temp=True) + recipe.add_conditional( "agencies", lambda x: x["url"].startswith("http"), @@ -41,3 +70,15 @@ recipe.add_transform( ): "bad_requests" }, ) +recipe.add_transform( + "responses", + "with_tiktoken_count", + tiktoken_count, + error_map={(ValueError, ParserError): "no_tiktoken_count"}, +) +recipe.add_conditional( + "with_tiktoken_count", + lambda x: x["tiktoken_count"] < 8000, + if_true="token_lt_8k", + if_false="token_gt_8k", +) diff --git a/foiaghost.py b/foiaghost.py index 8a0271c..f5243d2 100644 --- a/foiaghost.py +++ b/foiaghost.py @@ -1,11 +1,3 @@ -import asyncio -import httpx -import csv -from asyncio import Queue -from itertools import zip_longest -from scrapelib import Scraper, SQLiteCache -from scrapeghost import SchemaScraper, CSS - schema = { "public_records_email": "email", "public_records_address": "str", @@ -19,7 +11,7 @@ schema = { "agency_logo": "url", } extra_instructions = """ -The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests. +The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests. The fields that begin with general_contact should refer to contact information for the agency in general. If a field is not found in the HTML, leave it as null in the JSON. """