From a628562f6ef3085c7498b7e6dea9c40da3b48efb Mon Sep 17 00:00:00 2001 From: James Turk Date: Mon, 16 Sep 2024 20:18:41 -0500 Subject: [PATCH] create master people db --- src/ossql/people_to_sqlite.py | 36 +++++++++++++++++++++++++++-------- src/ossql/schemas/common.py | 2 +- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/ossql/people_to_sqlite.py b/src/ossql/people_to_sqlite.py index 0daf9f6..3b07e7e 100644 --- a/src/ossql/people_to_sqlite.py +++ b/src/ossql/people_to_sqlite.py @@ -19,11 +19,16 @@ def load_people_yaml(dir_path: pathlib.Path): # ensure all tables exist db.create_tables([Person, PersonLink, PersonSource, PersonRole, PersonOffice]) + # output stats + unused = set() + created = 0 + files = list(dir_path.glob("*.yml")) - print(f"preparing to load {len(files)} files") + print(f"preparing to load {len(files)} files from {dir_path}") for file in files: pdata = yaml.safe_load(file.read_text()) + created += 1 person = Person.create( id=pdata.pop("id"), name=pdata.pop("name"), @@ -32,23 +37,23 @@ def load_people_yaml(dir_path: pathlib.Path): birth_date=pdata.pop("birth_date", None), gender=pdata.pop("gender"), email=pdata.pop("email", ""), - image=pdata.pop("image"), + image=pdata.pop("image", ""), party=pdata.pop("party"), extras=pdata.pop("extras", {}), ) - to_links(person, pdata.pop("links"), PersonLink) - to_links(person, pdata.pop("sources"), PersonSource) + to_links(person, pdata.pop("links", []), PersonLink) + to_links(person, pdata.pop("sources", []), PersonSource) for role in pdata.pop("roles"): PersonRole.create( person=person, jurisdiction=role.pop("jurisdiction"), - district=role.pop("district"), + district=role.pop("district", ""), type=role.pop('type'), start_date=role.pop('start_date', None), end_date=role.pop('end_date', None), ) - for office in pdata.pop("offices"): + for office in pdata.pop("offices", []): PersonOffice.create( person=person, classification=office.pop("classification"), @@ -59,10 +64,25 @@ def load_people_yaml(dir_path: pathlib.Path): # currently not using other_names, other_identifiers if pdata.keys(): - print(pdata.keys(), "left unused") + unused.update(pdata.keys()) + + if unused: + print(unused, "left unused") + + return created if __name__ == "__main__": path = pathlib.Path(sys.argv[1]) - load_people_yaml(path) + n_people = 0 + if path.name == "data": + # load all states + states = list(path.glob("??")) + for state in sorted(states): + n_people += load_people_yaml(state / "legislature") + else: + # exact path, one state + n_people += load_people_yaml(path) + + print(f"Created {n_people} people") diff --git a/src/ossql/schemas/common.py b/src/ossql/schemas/common.py index 6581089..ec651ef 100644 --- a/src/ossql/schemas/common.py +++ b/src/ossql/schemas/common.py @@ -2,7 +2,7 @@ from peewee import SqliteDatabase, Model from playhouse.sqlite_ext import SqliteExtDatabase db = SqliteExtDatabase('openstates.db', pragmas=( - ('cache_size', -1024 * 64), # 64MB page-cache. + ('cache_size', 1024 * 64), # 64MB page-cache. ('journal_mode', 'wal'), # Use WAL-mode (you should always use this!). ('foreign_keys', 1)) )