foiaghost, but actually beakers
commit 27c6cb0f8d

README.md (new file, 53 lines)
@@ -0,0 +1,53 @@
## Michael's Email

### Data Dictionary

- `website` is supposed to be the homepage of the given agency. As you might know better than most people, that can be a surprisingly imprecise concept (they may only have a Facebook page, someone doing data entry might have picked a site that looked a lot like the official site but isn't really, or the data might simply be out of date).
- `url` is supposed to be the page dedicated to their public records submission process, or a FOIA portal if one exists. We started breaking FOIA portals out into their own model, so those are mostly not included in this list. (One favorite: the official FOIA page of one agency was a PDF at an IP address.) A quick sanity check along those lines is sketched after this list.
- This export unfortunately did not include the current actual contacts (email, address, fax) for public records that we have on file -- that's something we can pull separately if needed.
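Not part of the email itself -- just a minimal sketch of the kind of sanity check those caveats invite, flagging `url` values that point at a PDF or a bare IP address. The `agencies.csv` filename and the `id`/`url` column names are assumptions based on the export described above.

```python
import csv
import re

# rough heuristic only: an IP-address host or a .pdf path deserves a second look
IP_HOST = re.compile(r"^https?://\d{1,3}(\.\d{1,3}){3}(:\d+)?(/|$)")

with open("agencies.csv") as f:  # assumed export filename
    for row in csv.DictReader(f):
        url = (row.get("url") or "").strip()
        if url and (IP_HOST.match(url) or url.lower().endswith(".pdf")):
            print(f"worth a second look ({row.get('id')}): {url}")
```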
## Part One

### 1A

There are 3,800 URLs listed for agencies' dedicated FOIA pages in `url` (a rough liveness check is sketched after this list):

- Not AI, but is this website still good and available?
- If not, is there a URL it forwards to?
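A minimal sketch of that liveness check using httpx (already used elsewhere in this repo); `agencies.csv` and its `id`/`url` columns are assumptions about the export format:

```python
import csv
import httpx


def check_url(url: str) -> dict:
    """Is the URL still reachable, and does it forward anywhere?"""
    try:
        resp = httpx.get(url, follow_redirects=True, timeout=10)
        return {
            "url": url,
            "status": resp.status_code,
            "final_url": str(resp.url),          # where it forwards to, if anywhere
            "redirected": str(resp.url) != url,
        }
    except httpx.HTTPError as exc:
        return {"url": url, "status": None, "final_url": None, "redirected": False, "error": str(exc)}


with open("agencies.csv") as f:  # assumed export filename
    for row in csv.DictReader(f):
        if row.get("url"):
            print(check_url(row["url"]))
```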

### 1B

For each page, extract the following schema:

    {
      "public_records_email": "email",
      "public_records_address": "str",
      "public_records_phone": "555-555-5555",
      "public_records_fax": "555-555-5555",
      "public_records_web": "url",
      "general_contact_phone": "555-555-5555",
      "general_contact_address": "str",
      "foia_guide": "url",
      "public_reading_room": "url",
      "agency_logo": "url"
    }
The fields that begin with `public_records` should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.

The fields that begin with `general_contact` should refer to contact information for the agency in general.

If a field is not found in the HTML, leave it as null in the JSON.
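For reference, this schema plugs into scrapeghost roughly the way `src/foiaghost.py` does later in this commit; a minimal sketch (the URL is a placeholder, and this assumes the local scrapeghost checkout referenced in `pyproject.toml`):

```python
import httpx
from scrapeghost import SchemaScraper

schema = {
    "public_records_email": "email",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    # ... plus the remaining fields listed above
}

ghost = SchemaScraper(schema=schema, extra_preprocessors=[])

page = httpx.get("https://example.gov/foia", follow_redirects=True)  # placeholder URL
result = ghost.scrape(page.text)  # fields not found should come back as null
print(result)
```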
### Based on these Questions

- Does this page list a way for public records requests to be submitted via email?
- Same question, but for mail, fax, or a web portal for submitting records requests?
- Is there a phone number listed to reach out to with questions about FOIA requests?
- Is there a general contact phone number listed for this agency?
- Is there a general address listed for this agency?
- Is there a link to a guide for FOIA or public records requesters?
- Is there a link to a public reading room, or a place to browse documents or data posted by the agency or view requests submitted by other people?
- Is there a logo for the agency?

But honestly, anything you might think interesting to poke at would be useful.
## Part Two

Would also be really interested to see how well it could identify public records and FOIA pages for agencies if given only the `website`, since we have a lot more of those -- but having it be able to scrape for potentially updated contacts would be huge.
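A rough sketch of what that discovery pass might look like starting from `website`: scan the homepage's links for FOIA-ish anchor text or paths. The hint list is an assumption, and this assumes lxml is available alongside the other dependencies.

```python
import httpx
from lxml.html import fromstring

HINTS = ("foia", "public records", "open records", "public information")


def find_candidate_foia_pages(homepage: str) -> list[str]:
    """Return links on the homepage whose text or href looks FOIA-related."""
    resp = httpx.get(homepage, follow_redirects=True, timeout=10)
    doc = fromstring(resp.text)
    doc.make_links_absolute(str(resp.url))
    candidates = []
    for link in doc.xpath("//a[@href]"):
        text = link.text_content().lower()
        href = link.get("href").lower()
        if any(hint in text or hint.replace(" ", "-") in href for hint in HINTS):
            candidates.append(link.get("href"))
    return candidates
```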
Also, in case it's of interest, we've been working to integrate a tool called Klaxon (http://www.newsklaxon.org) into our portfolio and help it scale up. It monitors web pages for changes in a specified HTML element, so you could say "just alert me when there are changes in the `<body>`" or "just alert me when documents are added within a specific div." One thing I've been thinking about is setting up Klaxon to watch a specified area of a page, then having that trigger a secondary scraper that can do something more heavy-duty as warranted.
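Not Klaxon itself -- just a minimal sketch of that monitor-then-trigger idea under some assumptions: hash one element of the page (selected by XPath), keep the last hash in a small state file, and call a heavier scraper only when the hash changes. The state filename, selector, and callback are all placeholders.

```python
import hashlib
import json
import pathlib

import httpx
from lxml.html import fromstring, tostring

STATE = pathlib.Path("monitor_state.json")  # placeholder state file


def check_for_changes(url: str, xpath: str, on_change) -> None:
    """Hash the selected element and call on_change(url) if it differs from last time."""
    resp = httpx.get(url, follow_redirects=True, timeout=10)
    element = fromstring(resp.text).xpath(xpath)[0]  # e.g. "//body" or a specific div
    digest = hashlib.sha512(tostring(element)).hexdigest()

    state = json.loads(STATE.read_text()) if STATE.exists() else {}
    if state.get(url) != digest:
        state[url] = digest
        STATE.write_text(json.dumps(state))
        on_change(url)  # trigger the heavier-duty scraper here
```

Something like `check_for_changes(url, "//body", run_full_scrape)` on a schedule would approximate the Klaxon-plus-secondary-scraper setup described above (`run_full_scrape` being whatever the heavier pass is).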
poetry.lock (generated, new file, 1071 lines)
File diff suppressed because it is too large.
pyproject.toml (new file, 16 lines)
@@ -0,0 +1,16 @@
[tool.poetry]
name = "foiaghost"
version = "0.1.0"
description = ""
authors = ["James Turk <dev@jamesturk.net>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.11"
scrapeghost = {path = "../scrapeghost", develop = true}
scrapelib = "^2.1.0"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
src/beakers/__init__.py (new file, 0 lines)
src/beakers/__pycache__/__init__.cpython-311.pyc (new binary file, not shown)
src/beakers/__pycache__/beaker.cpython-311.pyc (new binary file, not shown)
src/beakers/__pycache__/recipe.cpython-311.pyc (new binary file, not shown)
src/beakers/__pycache__/sources.cpython-311.pyc (new binary file, not shown)
src/beakers/beaker.py (new file, 102 lines)
@@ -0,0 +1,102 @@
import sqlite3
import csv
import json
import hashlib

from structlog import get_logger

log = get_logger()


class Beaker:
    def __init__(self, table_name: str, db=None):
        self.table_name = table_name
        self.db = db if db else sqlite3.connect("beaker.db")
        self.cursor = self.db.cursor()
        self.cursor.row_factory = sqlite3.Row
        # create tables if they don't exist
        self._init_metadata()
        self.cursor.execute(
            f"CREATE TABLE IF NOT EXISTS {table_name} "
            "(id INTEGER PRIMARY KEY, data JSON, from_table TEXT NULL, from_id INTEGER NULL)"
        )

    def __str__(self):
        return f"Beaker({self.table_name})"

    __repr__ = __str__

    def __iter__(self):
        # yield (id, data) pairs, deserializing the stored JSON back into a dict
        self.cursor.execute(f"SELECT id, data FROM {self.table_name}")
        for item in self.cursor.fetchall():
            yield item["id"], json.loads(item["data"])

    def __len__(self):
        self.cursor.execute(f"SELECT COUNT(*) FROM {self.table_name}")
        return self.cursor.fetchone()[0]

    def _init_metadata(self):
        self.cursor.execute(
            "CREATE TABLE IF NOT EXISTS _metadata (id INTEGER PRIMARY KEY, table_name TEXT, data JSON)"
        )
        # only seed a metadata row the first time this table is seen
        self.cursor.execute(
            "SELECT 1 FROM _metadata WHERE table_name = ?", (self.table_name,)
        )
        if self.cursor.fetchone() is None:
            self.cursor.execute(
                "INSERT INTO _metadata (table_name, data) VALUES (?, ?)",
                (self.table_name, json.dumps({})),
            )
        self.db.commit()

    def get_metadata(self) -> dict:
        self.cursor.execute(
            "SELECT data FROM _metadata WHERE table_name = ?",
            (self.table_name,),
        )
        data = self.cursor.fetchone()["data"]
        return json.loads(data)

    def save_metadata(self, data: dict) -> None:
        self.cursor.execute(
            "UPDATE _metadata SET data = ? WHERE table_name = ?",
            (json.dumps(data), self.table_name),
        )
        self.db.commit()

    def add_item(self, item: dict, from_table=None, from_id=None) -> None:
        self.cursor.execute(
            f"INSERT INTO {self.table_name} (data, from_table, from_id) VALUES (?, ?, ?)",
            (json.dumps(item), from_table, from_id),
        )
        self.db.commit()

    @classmethod
    def from_csv(cls, table_name: str, filename: str) -> "Beaker":
        beaker = cls(table_name)
        lg = log.bind(table_name=table_name, filename=filename)
        # three cases: empty, match, mismatch
        # case 1: empty
        if len(beaker) == 0:
            with open(filename, "r") as file:
                reader = csv.DictReader(file)
                added = 0
                for row in reader:
                    beaker.add_item(row)
                    added += 1
            lg.info("from_csv", case="empty", added=added)
            meta = beaker.get_metadata()
            meta["sha512"] = get_sha512(filename)
            beaker.save_metadata(meta)
        else:
            old_sha = beaker.get_metadata().get("sha512")
            new_sha = get_sha512(filename)
            if old_sha != new_sha:
                # case 3: mismatch
                lg.info("from_csv", case="mismatch", old_sha=old_sha, new_sha=new_sha)
                raise Exception("sha512 mismatch")
            else:
                # case 2: match
                lg.info("from_csv", case="match")
        return beaker


def get_sha512(filename: str) -> str:
    with open(filename, "rb") as file:
        return hashlib.sha512(file.read()).hexdigest()
src/beakers/recipe.py (new file, 41 lines)
@@ -0,0 +1,41 @@
from typing import Callable

from .beaker import Beaker
from structlog import get_logger

log = get_logger()


class Pour:
    def __init__(self, from_beaker: Beaker, to_beaker: Beaker, transform: Callable):
        self.from_beaker = from_beaker
        self.to_beaker = to_beaker
        self.transform = transform


class Recipe:
    def __init__(self, name: str):
        self.name = name
        self.pours = []

    def __str__(self) -> str:
        return f"Recipe({self.name})"

    __repr__ = __str__

    def add_pour(self, from_beaker: Beaker, to_beaker: Beaker, transform: Callable):
        pour = Pour(from_beaker, to_beaker, transform)
        self.pours.append(pour)

    def run_linearly(self):
        # run each pour in order: read every item from the source beaker,
        # transform it, and add the result to the destination beaker
        log.info("recipe", recipe=self)
        for pour in self.pours:
            log.info(
                "pour",
                from_beaker=pour.from_beaker,
                to_beaker=pour.to_beaker,
                to_pour=len(pour.from_beaker),
            )
            for id, item in pour.from_beaker:
                log.info("pour_item", id=id, item=item)
                transformed = pour.transform(item)
                pour.to_beaker.add_item(transformed, pour.from_beaker.table_name, id)
src/example.py (new file, 24 lines)
@@ -0,0 +1,24 @@
import httpx

from beakers.recipe import Recipe
from beakers.beaker import Beaker


def add_response(obj_with_url):
    # fetch the record's URL and attach the response details
    url = obj_with_url["url"]
    response = httpx.get(url)
    return {
        "url": url,
        "status_code": response.status_code,
        "response_body": response.text,
    }


agencies = Beaker.from_csv("agencies", "agencies.csv")
responses = Beaker("responses")

recipe = Recipe("fetch urls")
recipe.add_pour(agencies, responses, add_response)

recipe.run_linearly()
src/foiaghost.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import asyncio
import csv
from itertools import zip_longest

import httpx
from scrapelib import Scraper, SQLiteCache
from scrapeghost import SchemaScraper, CSS

schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
}
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.
If a field is not found in the HTML, leave it as null in the JSON.
"""

# create a scraper w/ a sqlite cache
scraper = Scraper(requests_per_minute=600)
scraper.cache_storage = SQLiteCache("cache.sqlite")

# create a scrapeghost
ghost = SchemaScraper(
    schema=schema,
    extra_preprocessors=[],
)


def grouper(iterable, n, fillvalue=None):
    # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


async def fetch_urls(urls):
    async with httpx.AsyncClient() as client:
        tasks = [client.get(url, follow_redirects=True) for url in urls]
        # return_exceptions so one bad URL doesn't sink the whole batch
        return await asyncio.gather(*tasks, return_exceptions=True)


async def main():
    batch_size = 5

    with open("agencies.csv", "r") as inf, open("results.csv", "w") as outf:
        agencies = csv.DictReader(inf)
        out = csv.DictWriter(
            outf, fieldnames=["id", "url", "status"] + list(schema.keys())
        )
        out.writeheader()
        for batch in grouper(agencies, batch_size):
            # the final batch is padded with None by zip_longest
            batch = [agency for agency in batch if agency]
            pages = await fetch_urls([agency["url"] for agency in batch])
            for agency, page in zip(batch, pages):
                try:
                    if isinstance(page, Exception):
                        raise page
                    result = ghost.scrape(page.text)
                except Exception as e:
                    print(e)
                    out.writerow(
                        {
                            "id": agency["id"],
                            "url": agency["url"],
                            "status": "ERROR",
                        }
                    )
                    continue
                # assumes ghost.scrape returns a dict keyed by the schema fields
                out.writerow(
                    {
                        **result,
                        "id": agency["id"],
                        "url": agency["url"],
                        "status": "OK",
                    }
                )


if __name__ == "__main__":
    asyncio.run(main())