James Turk 2023-08-16 18:37:51 -05:00
parent 30f0fce57a
commit 3d0bee82e5
2 changed files with 3 additions and 82 deletions

databeakers.toml (new file, 3 additions)

@@ -0,0 +1,3 @@
[databeakers]
pipeline_path = "foiaghost.pipeline.recipe"
log_level = "info"

old.py (82 deletions)

@@ -1,82 +0,0 @@
# schema = {
#     "public_records_email": "email",
#     "public_records_address": "str",
#     "public_records_phone": "555-555-5555",
#     "public_records_fax": "555-555-5555",
#     "public_records_web": "url",
#     "general_contact_phone": "555-555-5555",
#     "general_contact_address": "str",
#     "foia_guide": "url",
#     "public_reading_room": "url",
#     "agency_logo": "url",
# }
# extra_instructions = """
# The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
# The fields that begin with general_contact should refer to contact information for the agency in general.
# If a field is not found in the HTML, leave it as null in the JSON.
# """
# # create a scraper w/ a sqlite cache
# scraper = Scraper(requests_per_minute=600)
# scraper.cache_storage = SQLiteCache("cache.sqlite")
# # create a scrapeghost
# ghost = SchemaScraper(
#     schema=schema,
#     extra_preprocessors=[],
# )
# agencies = []
# async def fetch_urls(urls):
#     async with httpx.AsyncClient() as client:
#         tasks = [client.get(url) for url in urls]
#         responses = await asyncio.gather(*tasks)
#         return responses
# async def worker(queue, batch_size):
#     with open("results.csv", "w") as outf:
#         out = csv.DictWriter(
#             outf, fieldnames=["id", "url", "status"] + list(schema.keys())
#         )
#         while True:
#             urls = []
#             for _ in range(batch_size):
#                 try:
#                     # get_nowait() is what raises QueueEmpty; await queue.get() would block instead
#                     url = queue.get_nowait()
#                     urls.append(url)
#                 except asyncio.QueueEmpty:
#                     break
#             if len(urls) > 0:
#                 responses = await fetch_urls(urls)
#                 yield responses
# async def main():
#     batch_size = 5
#     with open("agencies.csv", "r") as inf:
#         agencies = csv.DictReader(inf)
#         # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
#         # NOTE: the per-batch try/fetch loop was never written; `out`, `page`, and `agency` are undefined below
#             except Exception as e:
#                 print(e)
#                 out.writerow(
#                     {
#                         "id": agency["id"],
#                         "url": agency["url"],
#                         "status": "ERROR",
#                     }
#                 )
#                 continue
#             result = ghost.scrape(page.text)
#             out.writerow(
#                 result | {"id": agency["id"], "url": agency["url"], "status": "OK"}
#             )
# if __name__ == "__main__":
#     asyncio.run(main())
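
Note: the removed old.py was an unfinished, fully commented-out sketch of a batched async fetch loop. As a reference for the pattern it was reaching for, here is a minimal, self-contained sketch using only httpx and asyncio; the URLs and batch size are placeholders and this code is not part of the commit.

import asyncio

import httpx


async def fetch_batch(urls: list[str]) -> list[httpx.Response]:
    # One shared client per batch; gather() runs the requests concurrently.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        return list(await asyncio.gather(*(client.get(url) for url in urls)))


async def main() -> None:
    urls = ["https://example.com/a", "https://example.com/b"]  # placeholder input
    batch_size = 5
    # Process the URL list in fixed-size batches, as the deleted worker() intended.
    for start in range(0, len(urls), batch_size):
        for response in await fetch_batch(urls[start : start + batch_size]):
            print(response.url, response.status_code)


if __name__ == "__main__":
    asyncio.run(main())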