Big update in August

This commit is contained in:
James Turk 2023-08-16 18:37:24 -05:00
parent c510a4d5cc
commit 30f0fce57a
3 changed files with 108 additions and 42 deletions

28
src/foiaghost/ghost.py Normal file
View File

@ -0,0 +1,28 @@
from scrapeghost import SchemaScraper

# Fields scrapeghost should extract from an agency's web page.
# NOTE(review): the values appear to be example/type hints for scrapeghost
# ("email", "url", "str", a sample phone format) — confirm against
# scrapeghost's schema conventions.
schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
}

# Prompt text appended to scrapeghost's instructions, disambiguating the
# public_records_* fields from the general_contact_* fields.
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""

# create a scrapeghost scraper; imported elsewhere as `ghost`
ghost = SchemaScraper(
    schema=schema,
    models=["gpt-3.5-turbo-16k"],  # 16k-context model to fit large pages
    # extra_preprocessors=[],
    extra_instructions=extra_instructions,
    max_cost=10,  # spending cap — presumably USD, confirm in scrapeghost docs
)

View File

@ -13,3 +13,17 @@ class URL(BaseModel):
class Int(BaseModel):
    """Single-integer payload (e.g. a token count)."""

    # Field is deliberately named `int` (shadows the builtin inside the class
    # body); callers construct it as Int(int=...).
    int: int
class IdOnly(BaseModel):
    """Payload-free marker model.

    NOTE(review): presumably used where only the record's identity matters —
    confirm against the pipeline code that references it.
    """

    pass
class JSON(BaseModel):
    """Arbitrary scraped JSON payload."""

    # Either a JSON object (dict) or a JSON array (list).
    scraped_json: dict | list
class ScrapeghostResponse(BaseModel):
    """Result of one scrapeghost scrape (cost, timing, extracted data)."""

    # total_cost: API spend for the scrape; api_time: time spent in the API
    # call (assumed seconds — confirm against scrapeghost); data: the JSON
    # extracted according to the schema.
    total_cost: float
    api_time: float
    data: dict

View File

@ -1,14 +1,19 @@
from ssl import SSLCertVerificationError, SSLError
import csv
import httpx
import tiktoken
import lxml.html
import lxml.etree
from lxml.etree import ParserError
from databeakers import Pipeline
from ssl import SSLCertVerificationError, SSLError
from databeakers.pipeline import Pipeline, EdgeType, ErrorType
from databeakers.beakers import TempBeaker
from databeakers.http import HttpRequest, HttpResponse
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
from .models import Agency, URL, Int
import csv
from scrapeghost.errors import TooManyTokens
from openai.error import InvalidRequestError
from scrapeghost.errors import BadStop
from .models import Agency, URL, Int, IdOnly, ScrapeghostResponse
from .ghost import ghost
class CSVSource:
@ -24,10 +29,10 @@ class CSVSource:
def tiktoken_count(response):
if response["status_code"] != 200:
if response.status_code != 200:
raise ValueError("response status code is not 200")
html = response["response_body"]
html = response.response_body
# clean the html
cleaner = CleanHTML()
@ -37,50 +42,69 @@ def tiktoken_count(response):
html_again = lxml.html.tostring(doc, encoding="unicode")
tokens = len(encoding.encode(html_again))
response["tiktoken_count"] = tokens
return response
return Int(int=tokens)
# current thinking, beakers exist within a recipe
recipe = Pipeline("foiaghost", "foiaghost.db")

# Root beaker: one Agency record per row of the seed CSV.
recipe.add_beaker("agency", Agency)

# Count tokens in each fetched page so oversized pages can be filtered.
# tiktoken_count raises ValueError on non-200 responses and lxml raises
# ParserError on unparseable HTML; both land in "no_tiktoken_count".
recipe.add_beaker("tiktoken_count", Int)
recipe.add_transform(
    "responses",
    "tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)

# Seed the "agency" beaker from the CSV on disk.
recipe.add_seed(
    "agencies",
    "agency",
    CSVSource("agencies.csv", Agency),
)
# recipe.add_beaker("token_lt_8k", temp=True)
# recipe.add_beaker("token_gt_8k", temp=True)

# Keep only agencies whose URL starts with http(s).
# NOTE(review): EdgeType.conditional presumably drops records failing the
# predicate rather than erroring — confirm databeakers semantics.
recipe.add_beaker("good_urls", URL)
recipe.add_transform(
    "agency",
    "good_urls",
    lambda a: a.url.startswith("http"),
    edge_type=EdgeType.conditional,
)

# Fetch each good URL; network and TLS failures are routed to "bad_requests"
# instead of aborting the pipeline.
recipe.add_beaker("responses", HttpResponse)
recipe.add_transform(
    "good_urls",
    "responses",
    HttpRequest(),
    error_map={
        (
            httpx.HTTPError,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)

# NOTE(review): commented-out earlier versions of the transforms above,
# kept for reference — consider deleting.
# recipe.add_transform(
#     "good_urls",
#     "responses",
#     add_response,
#     error_map={
#         (
#             httpx.HTTPError,
#             SSLCertVerificationError,
#             SSLError,
#         ): "bad_requests"
#     },
# )
# recipe.add_conditional(
#     "with_tiktoken_count",
#     lambda x: x["tiktoken_count"] < 8000,
#     if_true="token_lt_8k",
#     if_false="token_gt_8k",
# )
class ProcessRecord:
    """Adapter that invokes *func* with keyword arguments pulled from a record.

    ``params_map`` maps each keyword-argument name to a
    ``(beaker_name, field_name)`` pair; on call, each value is resolved as
    ``getattr(record[beaker_name], field_name)`` and passed to *func*.
    """

    def __init__(self, func, params_map):
        self.func = func
        self.params_map = params_map

    def __call__(self, record):
        # Resolve every mapped parameter from the record, then delegate.
        extracted = {
            name: getattr(record[beaker], attr)
            for name, (beaker, attr) in self.params_map.items()
        }
        return self.func(**extracted)

    def __repr__(self):
        return f"ProcessRecord({self.func.__name__}, {self.params_map})"
def scrapeghost_response(record) -> ScrapeghostResponse:
    """Scrape the record's fetched HTML with the shared `ghost` scraper.

    Reads the response body stored in the "responses" beaker and wraps the
    scrape's cost, timing, and extracted data in a ScrapeghostResponse.
    """
    html = record["responses"].response_body
    result = ghost.scrape(url_or_html=html)
    return ScrapeghostResponse(
        total_cost=result.total_cost,
        api_time=result.api_time,
        data=result.data,
    )
# Run scrapeghost over every fetched page, routing known failure modes to
# dedicated error beakers instead of halting the pipeline.
# NOTE(review): no add_beaker("scrapeghost_response", ...) call is visible —
# confirm add_transform auto-registers the destination beaker.
recipe.add_transform(
    "responses",
    "scrapeghost_response",
    scrapeghost_response,
    # transform receives the whole record, not just the parent beaker's item
    whole_record=True,
    error_map={
        (TooManyTokens,): "scrapeghost_too_many_tokens",
        (BadStop,): "scrapeghost_bad_stop",
        (InvalidRequestError, ValueError, ParserError): "scrapeghost_invalid_request",
    },
)