cleanup
parent 30f0fce57a
commit 3d0bee82e5
databeakers.toml (new file, +3)
@@ -0,0 +1,3 @@
+[databeakers]
+pipeline_path = "foiaghost.pipeline.recipe"
+log_level = "info"
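The new config points databeakers at the foiaghost pipeline and sets the log level. Purely as an illustration (not part of this commit), the two keys can be read with the standard-library tomllib, assuming Python 3.11+ and that the file sits in the working directory:

# Illustrative sketch only -- not part of this commit.
# Reads databeakers.toml with the standard-library tomllib (Python 3.11+);
# the relative path is an assumption.
import tomllib

with open("databeakers.toml", "rb") as f:
    settings = tomllib.load(f)["databeakers"]

print(settings["pipeline_path"])  # foiaghost.pipeline.recipe
print(settings["log_level"])      # info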
old.py (deleted file, -82)
@@ -1,82 +0,0 @@
-# schema = {
-#     "public_records_email": "email",
-#     "public_records_address": "str",
-#     "public_records_phone": "555-555-5555",
-#     "public_records_fax": "555-555-5555",
-#     "public_records_web": "url",
-#     "general_contact_phone": "555-555-5555",
-#     "general_contact_address": "str",
-#     "foia_guide": "url",
-#     "public_reading_room": "url",
-#     "agency_logo": "url",
-# }
-# extra_instructions = """
-# The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests.
-# The fields that begin with general_contact should refer to contact information for the agency in general.
-# If a field is not found in the HTML, leave it as null in the JSON.
-# """
-
-# # create a scraper w/ a sqlite cache
-# scraper = Scraper(requests_per_minute=600)
-# scraper.cache_storage = SQLiteCache("cache.sqlite")
-
-# # create a scrapeghost
-# ghost = SchemaScraper(
-#     schema=schema,
-#     extra_preprocessors=[],
-# )
-
-
-# agencies = []
-
-
-# async def fetch_urls(urls):
-#     async with httpx.AsyncClient() as client:
-#         tasks = [client.get(url) for url in urls]
-#         responses = await asyncio.gather(*tasks)
-#         return responses
-
-
-# async def worker(queue, batch_size):
-#     with open("results.csv", "w") as outf:
-#         out = csv.DictWriter(
-#             outf, fieldnames=["id", "url", "status"] + list(schema.keys())
-#         )
-#         while True:
-#             urls = []
-#             for _ in range(batch_size):
-#                 try:
-#                     url = await queue.get()
-#                     urls.append(url)
-#                 except asyncio.QueueEmpty:
-#                     break
-#             if len(urls) > 0:
-#                 responses = await fetch_urls(urls, batch_size)
-#                 async yield responses
-
-
-# async def main():
-#     batch_size = 5
-
-#     with open("agencies.csv", "r") as inf,
-#         agencies = csv.DictReader(inf)
-#         # grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
-#             except Exception as e:
-#                 print(e)
-#                 out.writerow(
-#                     {
-#                         "id": agency["id"],
-#                         "url": agency["url"],
-#                         "status": "ERROR",
-#                     }
-#                 )
-#                 continue
-#             result = ghost.scrape(page.text)
-#             out.writerow(
-#                 result
-#                 + {"id": agency["id"], "url": agency["url"], "status": "OK"}
-#             )
-
-
-# if __name__ == "__main__":
-#     main()
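The deleted draft sketched batched asynchronous fetching with httpx, but it would not have run as written: `async yield` is not valid Python, fetch_urls is defined with one parameter and called with two, and main() never enters an event loop. A hedged sketch of just that batching pattern, with placeholder URLs and none of the scrapeghost/CSV pieces, could look like:

# Hedged sketch, not what replaces old.py: the batched-fetch idea from the
# deleted draft with its syntax errors fixed. URLs are placeholders.
import asyncio

import httpx


async def fetch_urls(urls):
    # Fetch one batch of URLs concurrently, reusing a single client.
    async with httpx.AsyncClient() as client:
        return await asyncio.gather(*(client.get(url) for url in urls))


async def main():
    urls = ["https://example.com/a", "https://example.com/b"]  # placeholders
    batch_size = 5
    # Slice-based batching; the draft pointed at the itertools "grouper"
    # recipe for the same job.
    for start in range(0, len(urls), batch_size):
        for resp in await fetch_urls(urls[start : start + batch_size]):
            print(resp.url, resp.status_code)


if __name__ == "__main__":
    asyncio.run(main())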