convert to beakers 0.1

commit c510a4d5cc (parent b357a6d1d5)
@@ -1,71 +0,0 @@
import datetime

from pydantic import BaseModel
import lxml.html

from beakers import Recipe
from beakers.http import HttpRequest


class ArticleURL(BaseModel):
    url: str
    source: str


class HttpResponse(BaseModel):
    url: str
    status: int
    content: str
    retrieved_at: datetime.datetime


class Article(BaseModel):
    title: str
    text: str
    image_urls: list[str]


def is_npr(item) -> bool:
    return item.url.source == "npr"


def extract_npr_article(item) -> Article:
    doc = lxml.html.fromstring(item.response.content)
    # lxml elements expose their text via text_content(), not a text() call
    title = doc.cssselect(".story-title")[0].text_content()
    text = doc.cssselect(".paragraphs-container")[0].text_content()
    return Article(
        title=title,
        text=text,
        image_urls=[],
    )


recipe = Recipe("newsface", "newsface.db")
recipe.add_beaker("url", ArticleURL)
recipe.add_beaker("response", HttpResponse)
recipe.add_beaker("article", Article)
recipe.add_transform("url", "response", HttpRequest)
recipe.add_conditional(
    "response",
    is_npr,
    "npr_article",
)
recipe.add_transform(
    "npr_article",
    "article",
    extract_npr_article,
)
recipe.add_transform("archived_article")


npr_examples = [
    ArticleURL(url="https://text.npr.org/1186770075", source="npr"),
    ArticleURL(url="https://text.npr.org/1186525577", source="npr"),
    ArticleURL(url="https://text.npr.org/1185780577", source="npr"),
]
other = [
    ArticleURL(url="https://nytimes.com", source="nytimes"),
]

recipe.add_seed(
    "url",
    npr_examples + other,
)
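For reference, the two transform functions above can be exercised outside of beakers. This is an illustrative sketch only (not part of the commit), assuming a SimpleNamespace stand-in for whatever item wrapper the recipe actually passes, and reusing the ArticleURL, is_npr, and extract_npr_article definitions above with a dummy URL and dummy HTML:

from types import SimpleNamespace

fake_html = """
<html><body>
  <h1 class="story-title">Example headline</h1>
  <div class="paragraphs-container"><p>Example body text.</p></div>
</body></html>
"""

# hypothetical stand-in item; the real wrapper type comes from the library
item = SimpleNamespace(
    url=ArticleURL(url="https://text.npr.org/0", source="npr"),  # dummy URL
    response=SimpleNamespace(content=fake_html),
)

if is_npr(item):
    article = extract_npr_article(item)
    print(article.title)  # Example headline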
@@ -1,83 +0,0 @@
from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from beakers.beakers import Beaker
from beakers.recipe import Recipe
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML


async def add_response(obj_with_url):
    url = obj_with_url["url"]
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


def tiktoken_count(response):
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    response["tiktoken_count"] = tokens

    return response


# current thinking, beakers exist within a recipe
recipe = Recipe("fetch urls", "url_example.db")
recipe.add_beaker("agencies")
recipe.add_beaker("responses")
recipe.add_beaker("bad_requests")
recipe.add_beaker("good_urls", temp=True)
recipe.add_beaker("missing_urls", temp=True)
recipe.add_beaker("with_tiktoken_count")
recipe.add_beaker("no_tiktoken_count", temp=True)
recipe.add_beaker("token_lt_8k", temp=True)
recipe.add_beaker("token_gt_8k", temp=True)

recipe.add_conditional(
    "agencies",
    lambda x: x["url"].startswith("http"),
    if_true="good_urls",
    if_false="missing_urls",
)
recipe.add_transform(
    "good_urls",
    "responses",
    add_response,
    error_map={
        (
            httpx.HTTPError,
            SSLCertVerificationError,
            SSLError,
        ): "bad_requests"
    },
)
recipe.add_transform(
    "responses",
    "with_tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
recipe.add_conditional(
    "with_tiktoken_count",
    lambda x: x["tiktoken_count"] < 8000,
    if_true="token_lt_8k",
    if_false="token_gt_8k",
)
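The token counting above boils down to encoding the cleaned HTML with tiktoken's cl100k_base encoding. A quick standalone check (illustrative only):

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
# number of cl100k_base tokens in an arbitrary string
print(len(enc.encode("<p>hello world</p>")))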
foiaghost.py (82 lines deleted)
@@ -1,82 +0,0 @@
schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
}
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""

# create a scraper w/ a sqlite cache
scraper = Scraper(requests_per_minute=600)
scraper.cache_storage = SQLiteCache("cache.sqlite")

# create a scrapeghost
ghost = SchemaScraper(
    schema=schema,
    extra_preprocessors=[],
)


agencies = []


async def fetch_urls(urls):
    async with httpx.AsyncClient() as client:
        tasks = [client.get(url) for url in urls]
        responses = await asyncio.gather(*tasks)
        return responses


async def worker(queue, batch_size):
    with open("results.csv", "w") as outf:
        out = csv.DictWriter(
            outf, fieldnames=["id", "url", "status"] + list(schema.keys())
        )
        while True:
            urls = []
            for _ in range(batch_size):
                try:
                    url = await queue.get()
                    urls.append(url)
                except asyncio.QueueEmpty:
                    break
            if len(urls) > 0:
                responses = await fetch_urls(urls, batch_size)
                async yield responses


async def main():
    batch_size = 5

    with open("agencies.csv", "r") as inf,
        agencies = csv.DictReader(inf)
        # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
        except Exception as e:
            print(e)
            out.writerow(
                {
                    "id": agency["id"],
                    "url": agency["url"],
                    "status": "ERROR",
                }
            )
            continue
        result = ghost.scrape(page.text)
        out.writerow(
            result
            + {"id": agency["id"], "url": agency["url"], "status": "OK"}
        )


if __name__ == "__main__":
    main()
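Note that worker() and main() above were left unfinished (the bare "async yield", the dangling "with ... ," and the orphaned "except" are not valid Python), which is presumably why this file is deleted here and archived as old.py. The batching idea it was reaching for can be sketched on its own; this is an illustrative sketch only, using plain httpx and asyncio with no CSV or scrapeghost handling:

import asyncio

import httpx


async def fetch_batch(urls):
    # issue every request in the batch concurrently and wait for all responses
    async with httpx.AsyncClient() as client:
        return await asyncio.gather(*(client.get(url) for url in urls))


async def demo():
    responses = await fetch_batch(["https://example.com"] * 3)
    for response in responses:
        print(response.status_code)


if __name__ == "__main__":
    asyncio.run(demo())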
old.py (new file, 82 lines)
@@ -0,0 +1,82 @@
# schema = {
#     "public_records_email": "email",
#     "public_records_address": "str",
#     "public_records_phone": "555-555-5555",
#     "public_records_fax": "555-555-5555",
#     "public_records_web": "url",
#     "general_contact_phone": "555-555-5555",
#     "general_contact_address": "str",
#     "foia_guide": "url",
#     "public_reading_room": "url",
#     "agency_logo": "url",
# }
# extra_instructions = """
# The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
# The fields that begin with general_contact should refer to contact information for the agency in general.
# If a field is not found in the HTML, leave it as null in the JSON.
# """

# # create a scraper w/ a sqlite cache
# scraper = Scraper(requests_per_minute=600)
# scraper.cache_storage = SQLiteCache("cache.sqlite")

# # create a scrapeghost
# ghost = SchemaScraper(
#     schema=schema,
#     extra_preprocessors=[],
# )


# agencies = []


# async def fetch_urls(urls):
#     async with httpx.AsyncClient() as client:
#         tasks = [client.get(url) for url in urls]
#         responses = await asyncio.gather(*tasks)
#         return responses


# async def worker(queue, batch_size):
#     with open("results.csv", "w") as outf:
#         out = csv.DictWriter(
#             outf, fieldnames=["id", "url", "status"] + list(schema.keys())
#         )
#         while True:
#             urls = []
#             for _ in range(batch_size):
#                 try:
#                     url = await queue.get()
#                     urls.append(url)
#                 except asyncio.QueueEmpty:
#                     break
#             if len(urls) > 0:
#                 responses = await fetch_urls(urls, batch_size)
#                 async yield responses


# async def main():
#     batch_size = 5

#     with open("agencies.csv", "r") as inf,
#         agencies = csv.DictReader(inf)
#         # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
#         except Exception as e:
#             print(e)
#             out.writerow(
#                 {
#                     "id": agency["id"],
#                     "url": agency["url"],
#                     "status": "ERROR",
#                 }
#             )
#             continue
#         result = ghost.scrape(page.text)
#         out.writerow(
#             result
#             + {"id": agency["id"], "url": agency["url"], "status": "OK"}
#         )


# if __name__ == "__main__":
#     main()
src/foiaghost/__init__.py (new file, empty)

src/foiaghost/models.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from pydantic import BaseModel


class Agency(BaseModel):
    id: str
    url: str
    name: str


class URL(BaseModel):
    url: str


class Int(BaseModel):
    int: int
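These are ordinary pydantic models, so an Agency can be built straight from a CSV row dict, which is what the CSVSource seed in pipeline.py below relies on. An illustrative sketch only, with made-up row values:

row = {"id": "1", "url": "https://example.com", "name": "Example Agency"}
agency = Agency(**row)
print(agency.name)  # Example Agency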
src/foiaghost/pipeline.py (new file, 86 lines)
@@ -0,0 +1,86 @@
from ssl import SSLCertVerificationError, SSLError
import httpx
import tiktoken
import lxml.html
from lxml.etree import ParserError
from databeakers import Pipeline
from databeakers.http import HttpRequest, HttpResponse
from scrapeghost import SchemaScraper
from scrapeghost.preprocessors import CleanHTML
from .models import Agency, URL, Int
import csv


class CSVSource:
    def __init__(self, filename, datatype):
        self.filename = filename
        self.datatype = datatype

    def __call__(self):
        with open(self.filename) as inf:
            reader = csv.DictReader(inf)
            for line in reader:
                yield self.datatype(**line)


def tiktoken_count(response):
    if response["status_code"] != 200:
        raise ValueError("response status code is not 200")

    html = response["response_body"]

    # clean the html
    cleaner = CleanHTML()
    encoding = tiktoken.get_encoding("cl100k_base")
    doc = lxml.html.fromstring(html)
    (doc,) = cleaner(doc)  # returns a 1-item list
    html_again = lxml.html.tostring(doc, encoding="unicode")
    tokens = len(encoding.encode(html_again))

    response["tiktoken_count"] = tokens

    return response


# current thinking, beakers exist within a recipe
recipe = Pipeline("foiaghost", "foiaghost.db")
recipe.add_beaker("agency", Agency)
recipe.add_beaker("good_urls", URL)
recipe.add_transform("agency", "good_urls", lambda x: x["url"].startswith("http"))
recipe.add_beaker("responses", HttpResponse)
recipe.add_transform("good_urls", "responses", HttpRequest)
recipe.add_beaker("tiktoken_count", Int)
recipe.add_transform(
    "responses",
    "tiktoken_count",
    tiktoken_count,
    error_map={(ValueError, ParserError): "no_tiktoken_count"},
)
recipe.add_seed(
    "agencies",
    "agency",
    CSVSource("agencies.csv", Agency),
)

# recipe.add_beaker("token_lt_8k", temp=True)
# recipe.add_beaker("token_gt_8k", temp=True)


# recipe.add_transform(
#     "good_urls",
#     "responses",
#     add_response,
#     error_map={
#         (
#             httpx.HTTPError,
#             SSLCertVerificationError,
#             SSLError,
#         ): "bad_requests"
#     },
# )
# recipe.add_conditional(
#     "with_tiktoken_count",
#     lambda x: x["tiktoken_count"] < 8000,
#     if_true="token_lt_8k",
#     if_false="token_gt_8k",
# )