foiaghost tiktoken counts
This commit is contained in:
parent
4c1392abff
commit
20f9809689
@ -1,7 +1,12 @@
|
|||||||
from ssl import SSLCertVerificationError, SSLError
|
from ssl import SSLCertVerificationError, SSLError
|
||||||
import httpx
|
import httpx
|
||||||
|
import tiktoken
|
||||||
|
import lxml.html
|
||||||
|
from lxml.etree import ParserError
|
||||||
from beakers.beakers import Beaker
|
from beakers.beakers import Beaker
|
||||||
from beakers.recipe import Recipe
|
from beakers.recipe import Recipe
|
||||||
|
from scrapeghost import SchemaScraper
|
||||||
|
from scrapeghost.preprocessors import CleanHTML
|
||||||
|
|
||||||
|
|
||||||
async def add_response(obj_with_url):
|
async def add_response(obj_with_url):
|
||||||
@ -16,6 +21,25 @@ async def add_response(obj_with_url):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def tiktoken_count(response, *, encoding_name="cl100k_base"):
    """Attach the token count of a response's cleaned HTML to the response.

    The body is parsed with lxml, run through scrapeghost's ``CleanHTML``
    preprocessor, serialized back to a string and tokenized with tiktoken.
    The count is stored under ``response["tiktoken_count"]``.

    Args:
        response: Mapping with at least ``"status_code"`` and
            ``"response_body"`` keys. It is updated in place.
        encoding_name: Name of the tiktoken encoding used for counting.
            Defaults to ``"cl100k_base"``.

    Returns:
        The same ``response`` object, now carrying ``"tiktoken_count"``.

    Raises:
        ValueError: If ``response["status_code"]`` is not 200.
        lxml.etree.ParserError: Propagated from ``lxml.html.fromstring``
            when the body cannot be parsed (for example an empty document).
    """
    # Only successful responses are worth tokenizing; bail out before any
    # parsing work is done.
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # Clean the HTML first so the count reflects the cleaned markup rather
    # than the raw page.
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding(encoding_name)
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    cleaned_html = lxml.html.tostring(doc, encoding="unicode")

    response["tiktoken_count"] = len(encoding.encode(cleaned_html))
    return response
|
||||||
|
|
||||||
|
|
||||||
# current thinking, beakers exist within a recipe
|
# current thinking, beakers exist within a recipe
|
||||||
recipe = Recipe("fetch urls", "url_example.db")
|
recipe = Recipe("fetch urls", "url_example.db")
|
||||||
recipe.add_beaker("agencies")
|
recipe.add_beaker("agencies")
|
||||||
@ -23,6 +47,11 @@ recipe.add_beaker("responses")
|
|||||||
recipe.add_beaker("bad_requests")
|
recipe.add_beaker("bad_requests")
|
||||||
recipe.add_beaker("good_urls", temp=True)
|
recipe.add_beaker("good_urls", temp=True)
|
||||||
recipe.add_beaker("missing_urls", temp=True)
|
recipe.add_beaker("missing_urls", temp=True)
|
||||||
|
recipe.add_beaker("with_tiktoken_count")
|
||||||
|
recipe.add_beaker("no_tiktoken_count", temp=True)
|
||||||
|
recipe.add_beaker("token_lt_8k", temp=True)
|
||||||
|
recipe.add_beaker("token_gt_8k", temp=True)
|
||||||
|
|
||||||
recipe.add_conditional(
|
recipe.add_conditional(
|
||||||
"agencies",
|
"agencies",
|
||||||
lambda x: x["url"].startswith("http"),
|
lambda x: x["url"].startswith("http"),
|
||||||
@ -41,3 +70,15 @@ recipe.add_transform(
|
|||||||
): "bad_requests"
|
): "bad_requests"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
# Count tokens for every stored response. tiktoken_count raises ValueError
# for non-200 responses; that and lxml's ParserError are routed to the
# "no_tiktoken_count" beaker instead of aborting the run.
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)

# Split the counted responses on the 8000-token threshold.
recipe.add_conditional(
    "with_tiktoken_count",
    lambda record: record["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)
|
||||||
|
10
foiaghost.py
10
foiaghost.py
@ -1,11 +1,3 @@
|
|||||||
import asyncio
|
|
||||||
import httpx
|
|
||||||
import csv
|
|
||||||
from asyncio import Queue
|
|
||||||
from itertools import zip_longest
|
|
||||||
from scrapelib import Scraper, SQLiteCache
|
|
||||||
from scrapeghost import SchemaScraper, CSS
|
|
||||||
|
|
||||||
schema = {
|
schema = {
|
||||||
"public_records_email": "email",
|
"public_records_email": "email",
|
||||||
"public_records_address": "str",
|
"public_records_address": "str",
|
||||||
@ -19,7 +11,7 @@ schema = {
|
|||||||
"agency_logo": "url",
|
"agency_logo": "url",
|
||||||
}
|
}
|
||||||
extra_instructions = """
|
extra_instructions = """
|
||||||
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests.
|
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
|
||||||
The fields that begin with general_contact should refer to contact information for the agency in general.
|
The fields that begin with general_contact should refer to contact information for the agency in general.
|
||||||
If a field is not found in the HTML, leave it as null in the JSON.
|
If a field is not found in the HTML, leave it as null in the JSON.
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user