"""Scrape public-records contact details for agencies listed in a CSV file.

Reads ``agencies.csv`` (the code reads the ``id`` and ``url`` columns), fetches
each URL concurrently in small batches with httpx, hands the HTML to a
scrapeghost ``SchemaScraper``, and writes one row per agency to
``results.csv``.
"""

import asyncio
import httpx
import csv
from asyncio import Queue
from itertools import zip_longest
from scrapelib import Scraper, SQLiteCache
from scrapeghost import SchemaScraper, CSS

# Field name -> type/example hint handed to scrapeghost; the keys are also
# used as the data columns of results.csv.
schema = {
    "public_records_email": "email",
    "public_records_address": "str",
    "public_records_phone": "555-555-5555",
    "public_records_fax": "555-555-5555",
    "public_records_web": "url",
    "general_contact_phone": "555-555-5555",
    "general_contact_address": "str",
    "foia_guide": "url",
    "public_reading_room": "url",
    "agency_logo": "url",
}

# NOTE(review): this text is defined but never passed to SchemaScraper below;
# confirm whether it should be supplied as extra instructions. The string is
# left untouched (including the "Freedome" spelling) pending that decision.
extra_instructions = """
The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests.
The fields that begin with general_contact should refer to contact information for the agency in general.

If a field is not found in the HTML, leave it as null in the JSON.
"""

# create a scraper w/ a sqlite cache
# NOTE(review): main() fetches through httpx, so this cached scraper is
# currently unused by the code in this file.
scraper = Scraper(requests_per_minute=600)
scraper.cache_storage = SQLiteCache("cache.sqlite")

# create a scrapeghost
ghost = SchemaScraper(
    schema=schema,
    extra_preprocessors=[],
)

# Kept for backward compatibility; main() works on its own local list.
agencies = []


async def fetch_urls(urls, *, return_exceptions=False):
    """Fetch every URL in ``urls`` concurrently with one shared client.

    Args:
        urls: iterable of URL strings.
        return_exceptions: when true, a failed request is returned in place
            (as the exception object) instead of aborting the whole batch.
            Defaults to False, i.e. the first failure propagates.

    Returns:
        A list of results in the same order as ``urls``.
    """
    async with httpx.AsyncClient() as client:
        tasks = [client.get(url) for url in urls]
        return await asyncio.gather(*tasks, return_exceptions=return_exceptions)


async def worker(queue, batch_size):
    """Drain ``queue`` in batches, yielding the responses for each batch.

    This is an async generator: each iteration takes up to ``batch_size``
    URLs that are already in ``queue``, fetches them with ``fetch_urls`` and
    yields the resulting list. It finishes once the queue is empty.

    Args:
        queue: an ``asyncio.Queue`` of URL strings.
        batch_size: maximum number of URLs fetched per batch.
    """
    while True:
        urls = []
        for _ in range(batch_size):
            try:
                # get_nowait() is the call that raises QueueEmpty; an awaited
                # get() would block forever on an empty queue instead.
                urls.append(queue.get_nowait())
            except asyncio.QueueEmpty:
                break
        if not urls:
            return
        yield await fetch_urls(urls)


def _grouper(iterable, n):
    """Collect items into tuples of length ``n``, padding the last with None.

    grouper -> https://docs.python.org/3/library/itertools.html#itertools-recipes
    """
    iterators = [iter(iterable)] * n
    return zip_longest(*iterators, fillvalue=None)


async def main():
    """Scrape every agency in agencies.csv and write results.csv.

    Each output row carries the agency ``id`` and ``url``, a ``status`` of
    ``OK`` or ``ERROR``, and (for ``OK`` rows) the scraped schema fields.
    """
    batch_size = 5

    with open("agencies.csv", "r", newline="") as inf:
        rows = list(csv.DictReader(inf))

    fieldnames = ["id", "url", "status"] + list(schema.keys())
    with open("results.csv", "w", newline="") as outf:
        out = csv.DictWriter(outf, fieldnames=fieldnames)
        out.writeheader()

        for group in _grouper(rows, batch_size):
            # Drop the None padding that fills out the final short group.
            batch = [agency for agency in group if agency is not None]
            pages = await fetch_urls(
                [agency["url"] for agency in batch],
                return_exceptions=True,
            )

            for agency, page in zip(batch, pages):
                if isinstance(page, BaseException):
                    if not isinstance(page, Exception):
                        # e.g. cancellation: not a per-agency fetch failure.
                        raise page
                    print(page)
                    out.writerow(
                        {
                            "id": agency["id"],
                            "url": agency["url"],
                            "status": "ERROR",
                        }
                    )
                    continue

                result = ghost.scrape(page.text)
                # NOTE(review): assumes scrape() returns either a mapping or
                # an object exposing the mapping as ``.data`` -- confirm
                # against the installed scrapeghost version.
                data = getattr(result, "data", result)
                out.writerow(
                    {**data, "id": agency["id"], "url": agency["url"], "status": "OK"}
                )


if __name__ == "__main__":
    asyncio.run(main())