from ssl import SSLCertVerificationError, SSLError

import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError

from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML


async def add_response(obj_with_url):
    """Fetch the item's URL and return the URL, status code, and response body."""
    print(obj_with_url["url"])
    url = obj_with_url["url"]
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


def tiktoken_count(response):
    """Count tokens in the cleaned response HTML; raises ValueError on non-200 responses."""
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    # cl100k_base is the encoding used by the gpt-3.5-turbo / gpt-4 chat models
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    response["tiktoken_count"] = tokens

    return response


# current thinking: beakers exist within a recipe
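# Overview of the wiring defined on `recipe` below:
#   agencies --(url starts with "http")--> good_urls, else missing_urls
#   good_urls --add_response--> responses; HTTP/SSL errors land in bad_requests
#   responses --tiktoken_count--> with_tiktoken_count; ValueError/ParserError land in no_tiktoken_count
#   with_tiktoken_count --(token count < 8000)--> token_lt_8k, else token_gt_8k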
recipe = Recipe("fetch urls", "url_example.db")

recipe.add_beaker("agencies")
recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)

recipe.add_conditional(
    "agencies",
    lambda x: x["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={
        (
            httpx.HTTPError,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
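# The 8,000-token cutoff below presumably mirrors an ~8k-token model context
# window; pages that fit go to token_lt_8k, everything else to token_gt_8k.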
recipe.add_conditional(
    "with_tiktoken_count",
    lambda x: x["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)
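

# A minimal manual check of the two transforms above, outside the recipe.
# This is only a sketch: it assumes network access, and the URL is an
# illustrative placeholder rather than a real item from the "agencies" beaker.
if __name__ == "__main__":
    import asyncio

    # fetch one page, then attach its cleaned-HTML token count
    fetched = asyncio.run(add_response({"url": "https://example.com"}))
    counted = tiktoken_count(fetched)
    print(counted["status_code"], counted["tiktoken_count"])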