foiaghost, but actually beakers

2023-04-27 01:25:07 -05:00 · 2023-04-27 01:25:07 -05:00 · 27c6cb0f8d
commit 27c6cb0f8d
12 changed files with 1397 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,53 @@
 ## Michael's Email
 ### Data Dictionary
 • `website` is supposed to be the homepage of the given agency. As you might know better than most people, that can be a surprisingly imprecise concept (they may only have a Facebook page, someone doing data entry might have picked a site that looked a lot like the official site but isn't really, the data might be out of date).
 • `url` is supposed to be the page dedicated to their public records submissions process, or a FOIA portal if one exists. We started breaking out FOIA portals to their own model, so those are mostly not included in this list. (One favorite: the official FOIA page of one agency was a PDF at an IP address.)
 • This export unfortunately did not include the current actual contacts (email, address, fax) for public records we have on file -- that's something we can pull separately if needed.
 ## Part One
 ### 1A
 3,800 URLs listed for agencies' dedicated FOIA pages in `url`:
 • Not AI, but is this website still good and available?
 • If not, is there a URL it forwards to?
 ### 1B
 {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url"
 }
 The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
 The fields that begin with general_contact should refer to contact information for the agency in general.
 If a field is not found in the HTML, leave it as null in the JSON.
 ### Based on these Questions
 • Does this page list a way for public records requests to be submitted via email?
 • ibid, but for mail, fax, web portal for submitting records requests?
 • Is there a phone number listed to reach out with questions about FOIA requests?
 • Is there a general contact phone number listed for this agency?
 • Is there a general address listed for this agency?
 • Is there a link to a guide for FOIA or public records requesters?
 • Is there a link to a public reading room or a place to browse documents or data posted by the agencies or view requests submitted by other people?
 • Is there a logo for the agency?
 But honestly, anything you might think interesting to poke at would be useful.
 ## Part Two
 Would also be really interested to see how well it could identify public records and FOIA pages for agencies if given the website, since we have a lot more of those, but having it be able to scrape for potentially updated contacts would be huge.
 Also, in case it's of interest, we've been working to integrate a tool called Klaxon (http://www.newsklaxon.org) into our portfolio and help it scale up. It monitors web pages for changes in a specified HTML element, so you could say just alert me when there are changes in the <body>, or just when documents are added within a specific div. One thing I've been thinking about is setting up Klaxon to look for changes to a specified area of a page, then setting up a secondary scraper that, once triggered, can do something more heavy duty as warranted.
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,16 @@
 [tool.poetry]
 name = "foiaghost"
 version = "0.1.0"
 description = ""
 authors = ["James Turk <dev@jamesturk.net>"]
 readme = "README.md"
 [tool.poetry.dependencies]
 python = "^3.11"
 scrapeghost = {path = "../scrapeghost", develop = true}
 scrapelib = "^2.1.0"
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
--- a/src/beakers/init.py
+++ b/src/beakers/init.py
--- a/src/beakers/pycache/init.cpython-311.pyc
+++ b/src/beakers/pycache/init.cpython-311.pyc
--- a/src/beakers/pycache/beaker.cpython-311.pyc
+++ b/src/beakers/pycache/beaker.cpython-311.pyc
--- a/src/beakers/pycache/recipe.cpython-311.pyc
+++ b/src/beakers/pycache/recipe.cpython-311.pyc
--- a/src/beakers/pycache/sources.cpython-311.pyc
+++ b/src/beakers/pycache/sources.cpython-311.pyc
--- a/src/beakers/beaker.py
+++ b/src/beakers/beaker.py
@ -0,0 +1,102 @@
 import sqlite3
 import csv
 import json
 import hashlib
 from structlog import get_logger
 log = get_logger()
 class Beaker:
    """A named collection of JSON items stored in one SQLite table.

    Each item is a row ``(id, data, from_table, from_id)`` where ``data`` is
    the JSON-encoded item and ``from_table``/``from_id`` optionally record the
    row the item was derived from. A shared ``_metadata`` table holds one JSON
    document per beaker.
    """

    def __init__(self, table_name: str, db=None):
        """Open the beaker's table, creating it (and its metadata row) if needed.

        Args:
            table_name: Name of the backing table. It is interpolated directly
                into SQL statements, so it must be a trusted identifier.
            db: An existing sqlite3 connection to use. When omitted, a
                connection to ``beaker.db`` is opened.
        """
        self.table_name = table_name
        self.db = db if db is not None else sqlite3.connect("beaker.db")
        self.cursor = self.db.cursor()
        self.cursor.row_factory = sqlite3.Row
        self._init_metadata()
        # create table if it doesn't exist
        self.cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {table_name} (id INTEGER PRIMARY KEY, data JSON, from_table TEXT NULL, from_id INTEGER NULL)"
        )

    def __str__(self):
        return f"Beaker({self.table_name})"

    __repr__ = __str__

    def __iter__(self):
        """Yield ``(id, item)`` pairs, with each item decoded from its JSON text."""
        self.cursor.execute(f"SELECT id, data FROM {self.table_name} ORDER BY id")
        # Materialize the rows first: the cursor is shared with the other
        # methods, which may execute new statements while a caller iterates.
        for row in self.cursor.fetchall():
            yield row["id"], json.loads(row["data"])

    def __len__(self):
        """Return the number of items stored in this beaker."""
        self.cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
        return self.cursor.fetchone()[0]

    def _init_metadata(self):
        """Ensure the ``_metadata`` table has exactly one row for this beaker."""
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS _metadata (id INTEGER PRIMARY KEY, table_name TEXT, data JSON)"
        )
        self.cursor.execute(
            "SELECT 1 FROM _metadata WHERE table_name = ?",
            (self.table_name,),
        )
        # Only seed the row once; re-opening an existing beaker must not
        # pile up duplicate metadata rows.
        if self.cursor.fetchone() is None:
            self.cursor.execute(
                "INSERT INTO _metadata (table_name, data) VALUES (?, ?)",
                (self.table_name, json.dumps({})),
            )
        self.db.commit()

    def get_metadata(self) -> dict:
        """Return this beaker's metadata document as a dict."""
        self.cursor.execute(
            "SELECT data FROM _metadata WHERE table_name = ?",
            (self.table_name,),
        )
        data = self.cursor.fetchone()["data"]
        return json.loads(data)

    def save_metadata(self, data: dict) -> None:
        """Replace this beaker's metadata document with ``data`` and commit."""
        self.cursor.execute(
            "UPDATE _metadata SET data = ? WHERE table_name = ?",
            (json.dumps(data), self.table_name),
        )
        self.db.commit()

    def add_item(self, item: dict, from_table=None, from_id=None) -> None:
        """Store ``item`` as JSON, recording where it came from, and commit.

        Args:
            item: JSON-serializable item to store.
            from_table: Name of the table the item was derived from, if any.
            from_id: Row id within ``from_table`` the item was derived from.
        """
        self.cursor.execute(
            f"INSERT INTO {self.table_name} (data, from_table, from_id) VALUES (?, ?, ?)",
            (json.dumps(item), from_table, from_id),
        )
        self.db.commit()

    @classmethod
    def from_csv(cls, table_name, filename: str) -> "Beaker":
        """Return a beaker populated from the rows of a CSV file.

        An empty beaker is filled from the file and the file's SHA-512 is
        saved in the metadata. A non-empty beaker is reused as-is when the
        saved digest matches the file.

        Raises:
            Exception: If the beaker already has items and the file's digest
                differs from the one saved when it was loaded.
        """
        beaker = cls(table_name)
        lg = log.bind(table_name=table_name, filename=filename)
        # three cases: empty, match, mismatch
        if len(beaker) == 0:
            # case 1: empty
            # newline="" lets the csv module handle embedded line breaks itself
            with open(filename, "r", newline="") as file:
                added = 0
                for row in csv.DictReader(file):
                    beaker.add_item(row)
                    added += 1
            lg.info("from_csv", case="empty", added=added)
            meta = beaker.get_metadata()
            meta["sha512"] = get_sha512(filename)
            beaker.save_metadata(meta)
            return beaker

        old_sha = beaker.get_metadata().get("sha512")
        new_sha = get_sha512(filename)
        if old_sha != new_sha:
            # case 3: mismatch
            lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha)
            raise Exception("sha512 mismatch")
        # case 2: match
        lg.info("from_csv", case="match")
        return beaker
 def get_sha512(filename: str) -> str:
    """Return the hex SHA-512 digest of the file at ``filename``.

    The file is hashed in fixed-size chunks so that large inputs are never
    held in memory all at once.
    """
    digest = hashlib.sha512()
    with open(filename, "rb") as file:
        while chunk := file.read(1 << 16):
            digest.update(chunk)
    return digest.hexdigest()
--- a/src/beakers/recipe.py
+++ b/src/beakers/recipe.py
@ -0,0 +1,41 @@
 from .beaker import Beaker
 from structlog import get_logger
 log = get_logger()
 class Pour:
    """One step of a recipe: a source beaker, a destination beaker, and the
    callable applied to each item on its way from one to the other."""

    def __init__(self, from_beaker: Beaker, to_beaker: Beaker, transform: callable):
        self.from_beaker, self.to_beaker, self.transform = (
            from_beaker,
            to_beaker,
            transform,
        )
 class Recipe:
    """A named, ordered list of pours that can be executed one after another."""

    def __init__(self, name: str):
        self.name = name
        self.pours = []

    def __str__(self) -> str:
        return f"Recipe({self.name})"

    __repr__ = __str__

    def add_pour(self, from_beaker: Beaker, to_beaker: Beaker, transform: callable):
        """Append a pour that moves items from ``from_beaker`` to ``to_beaker``.

        ``transform`` is called with each source item and its return value is
        what gets added to the destination.
        """
        self.pours.append(Pour(from_beaker, to_beaker, transform))

    def run_linearly(self):
        """Run every pour in the order added, one item at a time.

        Each transformed item is added to the destination beaker together with
        the source table name and source row id.
        """
        log.info("recipe", recipe=self)
        for pour in self.pours:
            log.info(
                "pour",
                from_beaker=pour.from_beaker,
                to_beaker=pour.to_beaker,
                to_pour=len(pour.from_beaker),
            )
            for item_id, item in pour.from_beaker:
                log.info("pour_item", id=item_id, item=item)
                # The transform is invoked as a plain call; nothing here
                # awaits the result, so it must return the finished item.
                transformed = pour.transform(item)
                pour.to_beaker.add_item(
                    transformed, pour.from_beaker.table_name, item_id
                )
--- a/src/example.py
+++ b/src/example.py
@ -0,0 +1,24 @@
 import asyncio
 import csv

 import httpx

 from beakers.recipe import Recipe
 from beakers.beaker import Beaker
 # Each Beaker wraps a table of the same name; with no connection passed in,
 # the table lives in beaker.db.
 # NOTE(review): `urls` is not referenced again in this script.
 urls = Beaker("urls")
 responses = Beaker("responses")
 async def add_response(obj_with_url):
    """Fetch ``obj_with_url["url"]`` and return a summary of the response.

    Returns a dict with the requested ``url``, the response ``status_code``
    and the decoded ``response_body``. Request errors raised by httpx are not
    caught here.
    """
    url = obj_with_url["url"]
    # httpx.get is a blocking call and cannot be awaited; the awaitable API
    # lives on AsyncClient.
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }
 agencies = Beaker.from_csv("agencies", "agencies.csv")
 recipe = Recipe("fetch urls")
 # `responses` already exists from the top of the script, so it is not
 # constructed a second time here.
 # Recipe.run_linearly calls the transform as a plain function, so the
 # coroutine is driven to completion per item before its result is stored.
 recipe.add_pour(
    agencies,
    responses,
    lambda item: asyncio.run(add_response(item)),
 )
 recipe.run_linearly()
--- a/src/foiaghost.py
+++ b/src/foiaghost.py
@ -0,0 +1,90 @@
 import asyncio
 import httpx
 import csv
 from asyncio import Queue
 from itertools import zip_longest
 from scrapelib import Scraper, SQLiteCache
 from scrapeghost import SchemaScraper, CSS
 # Fields to extract from each agency page. The keys are also used as CSV
 # column names in worker(). The values look like type/example hints for the
 # scraper -- TODO confirm against scrapeghost's schema format.
 schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
 }
 # Free-text guidance on how the two field groups differ.
 # NOTE(review): this string is not passed to SchemaScraper below.
 extra_instructions = """
 The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests.
 The fields that begin with general_contact should refer to contact information for the agency in general.
 If a field is not found in the HTML, leave it as null in the JSON.
 """
 # create a scraper w/ a sqlite cache
 # NOTE(review): `scraper` is not referenced by the functions visible in this file.
 scraper = Scraper(requests_per_minute=600)
 scraper.cache_storage = SQLiteCache("cache.sqlite")
 # create a scrapeghost
 ghost = SchemaScraper(
    schema=schema,
    extra_preprocessors=[],
 )
 # module-level placeholder; main() binds its own local `agencies`
 agencies = []
 async def fetch_urls(urls):
    """Request every URL concurrently over one shared client.

    Returns the list produced by ``asyncio.gather``, in the same order as
    ``urls``. No exception handling is done here, so a failed request
    propagates out of the ``await``.
    """
    async with httpx.AsyncClient() as session:
        pending = (session.get(target) for target in urls)
        return await asyncio.gather(*pending)
 async def worker(queue, batch_size):
    """Drain ``queue`` in batches and yield the fetched responses per batch.

    This is an async generator: each iteration pulls up to ``batch_size`` URLs
    that are already on the queue, fetches them with ``fetch_urls`` and yields
    the resulting list. Iteration ends once the queue is empty, so the queue
    is expected to be filled before the worker is consumed.

    Writing results is left to the caller; this function only fetches.
    """
    while True:
        urls = []
        for _ in range(batch_size):
            try:
                # get_nowait raises QueueEmpty on an empty queue, whereas
                # `await queue.get()` would block forever instead.
                urls.append(queue.get_nowait())
            except asyncio.QueueEmpty:
                break
        if not urls:
            return
        yield await fetch_urls(urls)
 # NOTE(review): this function is incomplete as written. The `with` header
 # ends in a dangling comma, and the loop and `try` that the `except` clause
 # below belongs to are missing, so the module does not parse as-is.
 async def main():
    batch_size = 5
    with open("agencies.csv", "r") as inf, 
        agencies = csv.DictReader(inf)
        # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
                    # NOTE(review): `out`, `agency` and `page` are never bound
                    # in the visible part of this function.
                    except Exception as e:
                        print(e)
                        out.writerow(
                            {
                                "id": agency["id"],
                                "url": agency["url"],
                                "status": "ERROR",
                            }
                        )
                        continue
                    result = ghost.scrape(page.text)
                    # NOTE(review): `+` is not defined between two dicts; if
                    # `result` is a dict this needs a merge instead -- confirm
                    # what ghost.scrape returns.
                    out.writerow(
                        result
                        + {"id": agency["id"], "url": agency["url"], "status": "OK"}
                    )
 if __name__ == "__main__":
    # main is a coroutine function: calling it bare only creates a coroutine
    # object, so it has to be handed to an event loop to actually run.
    asyncio.run(main())