foiaghost, but actually beakers

This commit is contained in:
James Turk 2023-04-27 01:25:07 -05:00
commit 27c6cb0f8d
12 changed files with 1397 additions and 0 deletions

53
README.md Normal file

@@ -0,0 +1,53 @@
## Michael's Email
### Data Dictionary
`website` is supposed to be the homepage of the given agency. As you might know better than most people, that can be a surprisingly imprecise concept (they may only have a Facebook page, someone doing data entry might have picked a site that looked a lot like the official site but isn't really, or the data might be out of date).
`url` is supposed to be the page dedicated to their public records submission process, or a FOIA portal if one exists. We started breaking out FOIA portals to their own model, so those are mostly not included in this list. (One favorite: the official FOIA page of one agency was a PDF at an IP address.)
• This export unfortunately did not include the current actual contacts (email, address, fax) for public records that we have on file -- that's something we can pull separately if needed.
## Part One
### 1A
3,800 URLs listed for agencies' dedicated FOIA pages in `url`:
• Not AI, but: is this website still good and available?
• If not, is there a URL it forwards to? (A liveness check like the sketch below would answer both.)
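A minimal sketch of that check, assuming httpx (already a project dependency) and that the export lands in an `agencies.csv` with a `url` column (both names are assumptions):

```python
# Sketch: batch liveness check over the ~3,800 `url` values.
# agencies.csv and the `url` column name are assumptions.
import asyncio
import csv

import httpx


async def check_url(client: httpx.AsyncClient, url: str) -> dict:
    try:
        resp = await client.get(url, follow_redirects=True, timeout=10)
        return {
            "url": url,
            "status": resp.status_code,
            # final_url differs from url when the page forwards elsewhere
            "final_url": str(resp.url),
        }
    except httpx.HTTPError as exc:
        return {"url": url, "status": "error", "final_url": None, "error": str(exc)}


async def main() -> list[dict]:
    with open("agencies.csv") as f:
        urls = [row["url"] for row in csv.DictReader(f) if row.get("url")]
    async with httpx.AsyncClient() as client:
        return await asyncio.gather(*(check_url(client, u) for u in urls))


results = asyncio.run(main())
```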
### 1B
{
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url"
}
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
### Based on these Questions
• Does this page list a way for public records requests to be submitted via email?
• The same, but for mail, fax, or a web portal for submitting records requests?
• Is there a phone number listed to reach out with questions about FOIA requests?
• Is there a general contact phone number listed for this agency?
• Is there a general address listed for this agency?
• Is there a link to a guide for FOIA or public records requesters?
• Is there a link to a public reading room, or a place to browse documents or data posted by the agency, or to view requests submitted by other people?
• Is there a logo for the agency?
But honestly, anything you might think interesting to poke at would be useful.
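For what it's worth, here is a rough sketch of how the schema and instructions above might plug into scrapeghost, mirroring `src/foiaghost.py` below (the fetch step and example URL are stand-ins):

```python
# Sketch: run the 1B schema over a single page with scrapeghost.
# The URL is a placeholder; the schema is the JSON above.
import httpx
from scrapeghost import SchemaScraper

schema = {
    "public_records_email": "email",
    "public_records_web": "url",
    # ...remaining fields from the JSON above...
}

ghost = SchemaScraper(schema=schema)

html = httpx.get("https://example.gov/foia", follow_redirects=True).text
result = ghost.scrape(html)  # same call used in src/foiaghost.py
```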
## Part Two
I would also be really interested to see how well it could identify public records and FOIA pages for agencies if given only the website, since we have a lot more of those, but having it be able to scrape for potentially updated contacts would be huge.
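One hedged way to try that with the same tooling: give scrapeghost the homepage HTML and a tiny link-finding schema (these field names are made up for illustration):

```python
# Sketch: ask scrapeghost to locate FOIA/public-records pages given a
# homepage. Schema field names here are illustrative, not fixed.
from scrapeghost import SchemaScraper

link_finder = SchemaScraper(
    schema={
        "foia_page_url": "url",
        "records_portal_url": "url",
        "records_contact_email": "email",
    }
)

# homepage_html would come from whatever fetch layer is in use
# found = link_finder.scrape(homepage_html)
```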
Also, in case it's of interest, we've been working to integrate a tool called Klaxon (http://www.newsklaxon.org) into our portfolio and help it scale up. It monitors web pages for changes in a specified HTML element, so you could say: just alert me when there are changes in the `<body>`, or just when documents are added within a specific div. One thing I've been thinking about is setting up Klaxon to look for changes to a specified area of a page, then setting up a secondary scraper that's triggered and can do something more heavy duty as warranted.
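That trigger idea could be prototyped roughly like this (this is not Klaxon's actual API; lxml and the XPath default are assumptions):

```python
# Rough sketch of the Klaxon-style trigger: fingerprint one element,
# fire a heavier scrape when the fingerprint changes. Not Klaxon's API.
import hashlib

import httpx
from lxml import html  # assumption: lxml is available


def element_fingerprint(url: str, xpath: str = "//body") -> str:
    page = httpx.get(url, follow_redirects=True)
    node = html.fromstring(page.text).xpath(xpath)[0]
    return hashlib.sha512(html.tostring(node)).hexdigest()


def check_and_maybe_scrape(url: str, last: str | None, heavy_scrape) -> str:
    current = element_fingerprint(url)
    if last is not None and current != last:
        heavy_scrape(url)  # e.g. a full scrapeghost pass over the page
    return current
```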

1071
poetry.lock generated Normal file

File diff suppressed because it is too large

16
pyproject.toml Normal file

@@ -0,0 +1,16 @@
[tool.poetry]
name = "foiaghost"
version = "0.1.0"
description = ""
authors = ["James Turk <dev@jamesturk.net>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
scrapeghost = {path = "../scrapeghost", develop = true}
scrapelib = "^2.1.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

0
src/beakers/__init__.py Normal file

Binary file not shown.

102
src/beakers/beaker.py Normal file

@@ -0,0 +1,102 @@
import sqlite3
import csv
import json
import hashlib
from structlog import get_logger

log = get_logger()


class Beaker:
    """A named, SQLite-backed container of JSON items."""

    def __init__(self, table_name: str, db=None):
        self.table_name = table_name
        self.db = db if db else sqlite3.connect("beaker.db")
        self.cursor = self.db.cursor()
        self.cursor.row_factory = sqlite3.Row
        # create tables if they don't exist
        self._init_metadata()
        self.cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {table_name} "
            "(id INTEGER PRIMARY KEY, data JSON, from_table TEXT NULL, from_id INTEGER NULL)"
        )

    def __str__(self):
        return f"Beaker({self.table_name})"

    __repr__ = __str__

    def __iter__(self):
        # select id alongside data so callers get both
        self.cursor.execute(f"SELECT id, data FROM {self.table_name}")
        for item in self.cursor.fetchall():
            yield item["id"], json.loads(item["data"])

    def __len__(self):
        self.cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
        return self.cursor.fetchone()[0]

    def _init_metadata(self):
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS _metadata (id INTEGER PRIMARY KEY, table_name TEXT, data JSON)"
        )
        # only create this beaker's metadata row once
        self.cursor.execute(
            "SELECT 1 FROM _metadata WHERE table_name = ?", (self.table_name,)
        )
        if self.cursor.fetchone() is None:
            self.cursor.execute(
                "INSERT INTO _metadata (table_name, data) VALUES (?, ?)",
                (self.table_name, json.dumps({})),
            )
            self.db.commit()

    def get_metadata(self) -> dict:
        self.cursor.execute(
            "SELECT data FROM _metadata WHERE table_name = ?",
            (self.table_name,),
        )
        return json.loads(self.cursor.fetchone()["data"])

    def save_metadata(self, data: dict) -> None:
        self.cursor.execute(
            "UPDATE _metadata SET data = ? WHERE table_name = ?",
            (json.dumps(data), self.table_name),
        )
        self.db.commit()

    def add_item(self, item: dict, from_table=None, from_id=None) -> None:
        # record provenance: which table/row this item was derived from
        self.cursor.execute(
            f"INSERT INTO {self.table_name} (data, from_table, from_id) VALUES (?, ?, ?)",
            (json.dumps(item), from_table, from_id),
        )
        self.db.commit()

    @classmethod
    def from_csv(cls, table_name: str, filename: str) -> "Beaker":
        beaker = cls(table_name)
        lg = log.bind(table_name=table_name, filename=filename)
        # three cases: empty, match, mismatch
        if len(beaker) == 0:
            # case 1: empty -> load the file and record its hash
            with open(filename, "r") as file:
                reader = csv.DictReader(file)
                added = 0
                for row in reader:
                    beaker.add_item(row)
                    added += 1
            lg.info("from_csv", case="empty", added=added)
            meta = beaker.get_metadata()
            meta["sha512"] = get_sha512(filename)
            beaker.save_metadata(meta)
        else:
            old_sha = beaker.get_metadata().get("sha512")
            new_sha = get_sha512(filename)
            if old_sha != new_sha:
                # case 3: mismatch -> refuse to mix data from different files
                lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha)
                raise Exception("sha512 mismatch")
            else:
                # case 2: match -> already loaded, nothing to do
                lg.info("from_csv", case="match")
        return beaker


def get_sha512(filename: str) -> str:
    with open(filename, "rb") as file:
        return hashlib.sha512(file.read()).hexdigest()
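A quick usage note on the sha512 guard in `from_csv` (file and table names as used in `src/example.py`):

```python
# Assumed behavior of the sha512 guard (see from_csv above):
beaker = Beaker.from_csv("agencies", "agencies.csv")  # empty: loads rows, records hash
beaker = Beaker.from_csv("agencies", "agencies.csv")  # match: already loaded, no-op
# editing agencies.csv and calling again -> mismatch: raises "sha512 mismatch"
```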

41
src/beakers/recipe.py Normal file

@@ -0,0 +1,41 @@
from .beaker import Beaker
from structlog import get_logger

log = get_logger()


class Pour:
    """A single step: move items from one beaker to another via a transform."""

    def __init__(self, from_beaker: Beaker, to_beaker: Beaker, transform: callable):
        self.from_beaker = from_beaker
        self.to_beaker = to_beaker
        self.transform = transform


class Recipe:
    def __init__(self, name: str):
        self.name = name
        self.pours = []

    def __str__(self) -> str:
        return f"Recipe({self.name})"

    __repr__ = __str__

    def add_pour(self, from_beaker: Beaker, to_beaker: Beaker, transform: callable):
        self.pours.append(Pour(from_beaker, to_beaker, transform))

    def run_linearly(self):
        log.info("recipe", recipe=self)
        for pour in self.pours:
            log.info(
                "pour",
                from_beaker=pour.from_beaker,
                to_beaker=pour.to_beaker,
                to_pour=len(pour.from_beaker),
            )
            for id, item in pour.from_beaker:
                log.info("pour_item", id=id, item=item)
                transformed = pour.transform(item)
                pour.to_beaker.add_item(transformed, pour.from_beaker.table_name, id)

24
src/example.py Normal file

@@ -0,0 +1,24 @@
import httpx

from beakers.recipe import Recipe
from beakers.beaker import Beaker


def add_response(obj_with_url):
    # Recipe.run_linearly calls transforms synchronously, so this uses
    # httpx's sync API rather than an async client.
    url = obj_with_url["url"]
    response = httpx.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


agencies = Beaker.from_csv("agencies", "agencies.csv")
responses = Beaker("responses")

recipe = Recipe("fetch urls")
recipe.add_pour(agencies, responses, add_response)
recipe.run_linearly()

90
src/foiaghost.py Normal file

@@ -0,0 +1,90 @@
import asyncio
import csv

import httpx
from scrapelib import Scraper, SQLiteCache
from scrapeghost import SchemaScraper

schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
}
# defined for the prompt above; not yet passed to SchemaScraper
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""

# create a scraper w/ a sqlite cache
# (not yet wired in; fetching currently goes through httpx below)
scraper = Scraper(requests_per_minute=600)
scraper.cache_storage = SQLiteCache("cache.sqlite")

# create a scrapeghost
ghost = SchemaScraper(
    schema=schema,
    extra_preprocessors=[],
)


async def fetch_urls(urls):
    async with httpx.AsyncClient() as client:
        tasks = [client.get(url, follow_redirects=True) for url in urls]
        # return_exceptions=True so one bad URL doesn't sink the whole batch
        return await asyncio.gather(*tasks, return_exceptions=True)


async def main():
    batch_size = 5
    with open("agencies.csv", "r") as inf, open("results.csv", "w") as outf:
        agencies = list(csv.DictReader(inf))
        out = csv.DictWriter(
            outf, fieldnames=["id", "url", "status"] + list(schema.keys())
        )
        out.writeheader()
        # process in fixed-size batches, in the spirit of the grouper recipe
        # https://docs.python.org/3/library/itertools.html#itertools-recipes
        for start in range(0, len(agencies), batch_size):
            batch = agencies[start : start + batch_size]
            responses = await fetch_urls([agency["url"] for agency in batch])
            for agency, page in zip(batch, responses):
                if isinstance(page, Exception):
                    print(page)
                    out.writerow(
                        {
                            "id": agency["id"],
                            "url": agency["url"],
                            "status": "ERROR",
                        }
                    )
                    continue
                result = ghost.scrape(page.text)
                out.writerow(
                    {**result, "id": agency["id"], "url": agency["url"], "status": "OK"}
                )


if __name__ == "__main__":
    asyncio.run(main())