From 3d0bee82e5bcf7ab0832872fc05bb14487173b28 Mon Sep 17 00:00:00 2001 From: James Turk Date: Wed, 16 Aug 2023 18:37:51 -0500 Subject: [PATCH] cleanup --- databeakers.toml | 3 ++ old.py | 82 ------------------------------------------------ 2 files changed, 3 insertions(+), 82 deletions(-) create mode 100644 databeakers.toml delete mode 100644 old.py diff --git a/databeakers.toml b/databeakers.toml new file mode 100644 index 0000000..3d4d78f --- /dev/null +++ b/databeakers.toml @@ -0,0 +1,3 @@ +[databeakers] +pipeline_path = "foiaghost.pipeline.recipe" +log_level = "info" \ No newline at end of file diff --git a/old.py b/old.py deleted file mode 100644 index 6deb4b3..0000000 --- a/old.py +++ /dev/null @@ -1,82 +0,0 @@ -# schema = { -# "public_records_email": "email", -# "public_records_address": "str", -# "public_records_phone": "555-555-5555", -# "public_records_fax": "555-555-5555", -# "public_records_web": "url", -# "general_contact_phone": "555-555-5555", -# "general_contact_address": "str", -# "foia_guide": "url", -# "public_reading_room": "url", -# "agency_logo": "url", -# } -# extra_instructions = """ -# The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests. -# The fields that begin with general_contact should refer to contact information for the agency in general. -# If a field is not found in the HTML, leave it as null in the JSON. -# """ - -# # create a scraper w/ a sqlite cache -# scraper = Scraper(requests_per_minute=600) -# scraper.cache_storage = SQLiteCache("cache.sqlite") - -# # create a scrapeghost -# ghost = SchemaScraper( -# schema=schema, -# extra_preprocessors=[], -# ) - - -# agencies = [] - - -# async def fetch_urls(urls): -# async with httpx.AsyncClient() as client: -# tasks = [client.get(url) for url in urls] -# responses = await asyncio.gather(*tasks) -# return responses - - -# async def worker(queue, batch_size): -# with open("results.csv", "w") as outf: -# out = csv.DictWriter( -# outf, fieldnames=["id", "url", "status"] + list(schema.keys()) -# ) -# while True: -# urls = [] -# for _ in range(batch_size): -# try: -# url = await queue.get() -# urls.append(url) -# except asyncio.QueueEmpty: -# break -# if len(urls) > 0: -# responses = await fetch_urls(urls, batch_size) -# async yield responses - - -# async def main(): -# batch_size = 5 - -# with open("agencies.csv", "r") as inf, -# agencies = csv.DictReader(inf) -# # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes -# except Exception as e: -# print(e) -# out.writerow( -# { -# "id": agency["id"], -# "url": agency["url"], -# "status": "ERROR", -# } -# ) -# continue -# result = ghost.scrape(page.text) -# out.writerow( -# result -# + {"id": agency["id"], "url": agency["url"], "status": "OK"} -# ) - - -# if __name__ == "__main__": -# main()