foiaghost tiktoken counts
This commit is contained in:
		
							parent
							
								
									4c1392abff
								
							
						
					
					
						commit
						20f9809689
					
				
					 2 changed files with 42 additions and 9 deletions
				
			
		|  | @ -1,7 +1,12 @@ | |||
| from ssl import SSLCertVerificationError, SSLError | ||||
| import httpx | ||||
| import tiktoken | ||||
| import lxml.html | ||||
| from lxml.etree import ParserError | ||||
| from beakers.beakers import Beaker | ||||
| from beakers.recipe import Recipe | ||||
| from scrapeghost import SchemaScraper | ||||
| from scrapeghost.preprocessors import CleanHTML | ||||
| 
 | ||||
| 
 | ||||
| async def add_response(obj_with_url): | ||||
|  | @ -16,6 +21,25 @@ async def add_response(obj_with_url): | |||
|     } | ||||
| 
 | ||||
| 
 | ||||
| def tiktoken_count(response): | ||||
|     """Add a ``tiktoken_count`` entry to ``response`` and return it. | ||||
| 
 | ||||
|     The count is the number of ``cl100k_base`` tokens in the response body | ||||
|     after it has been parsed with lxml, passed through ``CleanHTML`` and | ||||
|     serialized back to a unicode string. | ||||
| 
 | ||||
|     ``response`` is read via the ``"status_code"`` and ``"response_body"`` | ||||
|     keys; it is modified in place and the same object is returned. | ||||
| 
 | ||||
|     Raises ValueError when ``status_code`` is not 200. | ||||
|     NOTE(review): lxml presumably raises ParserError from ``fromstring`` on | ||||
|     empty/unparseable bodies -- confirm. | ||||
|     """ | ||||
|     if response["status_code"] != 200: | ||||
|         raise ValueError("response status code is not 200") | ||||
| 
 | ||||
|     html = response["response_body"] | ||||
| 
 | ||||
|     # clean the html, then count tokens in the cleaned markup (not the raw body) | ||||
|     cleaner = CleanHTML() | ||||
|     encoding = tiktoken.get_encoding("cl100k_base") | ||||
|     doc = lxml.html.fromstring(html) | ||||
|     (doc,) = cleaner(doc)  # returns a 1-item list | ||||
|     html_again = lxml.html.tostring(doc, encoding="unicode") | ||||
|     tokens = len(encoding.encode(html_again)) | ||||
| 
 | ||||
|     response["tiktoken_count"] = tokens | ||||
| 
 | ||||
|     return response | ||||
| 
 | ||||
| 
 | ||||
| # current thinking, beakers exist within a recipe | ||||
| recipe = Recipe("fetch urls", "url_example.db") | ||||
| recipe.add_beaker("agencies") | ||||
|  | @ -23,6 +47,11 @@ recipe.add_beaker("responses") | |||
| recipe.add_beaker("bad_requests") | ||||
| recipe.add_beaker("good_urls", temp=True) | ||||
| recipe.add_beaker("missing_urls", temp=True) | ||||
| # beakers for the token-counting stage: the tiktoken_count transform's output | ||||
| # and error destinations, plus the two sides of the 8000-token split | ||||
| recipe.add_beaker("with_tiktoken_count") | ||||
| recipe.add_beaker("no_tiktoken_count", temp=True) | ||||
| recipe.add_beaker("token_lt_8k", temp=True) | ||||
| recipe.add_beaker("token_gt_8k", temp=True) | ||||
| 
 | ||||
| recipe.add_conditional( | ||||
|     "agencies", | ||||
|     lambda x: x["url"].startswith("http"), | ||||
|  | @ -41,3 +70,15 @@ recipe.add_transform( | |||
|         ): "bad_requests" | ||||
|     }, | ||||
| ) | ||||
| # Apply tiktoken_count to "responses", writing to "with_tiktoken_count". | ||||
| # Presumably items raising ValueError/ParserError are diverted to | ||||
| # "no_tiktoken_count" -- confirm against beakers' error_map semantics. | ||||
| recipe.add_transform( | ||||
|     "responses", | ||||
|     "with_tiktoken_count", | ||||
|     tiktoken_count, | ||||
|     error_map={(ValueError, ParserError): "no_tiktoken_count"}, | ||||
| ) | ||||
| # Split counted responses on the 8000-token threshold (strictly less than | ||||
| # 8000 goes to "token_lt_8k", everything else to "token_gt_8k"). | ||||
| recipe.add_conditional( | ||||
|     "with_tiktoken_count", | ||||
|     lambda x: x["tiktoken_count"] < 8000, | ||||
|     if_true="token_lt_8k", | ||||
|     if_false="token_gt_8k", | ||||
| ) | ||||
|  |  | |||
							
								
								
									
										10
									
								
								foiaghost.py
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								foiaghost.py
									
									
									
									
									
								
							|  | @ -1,11 +1,3 @@ | |||
| import asyncio | ||||
| import httpx | ||||
| import csv | ||||
| from asyncio import Queue | ||||
| from itertools import zip_longest | ||||
| from scrapelib import Scraper, SQLiteCache | ||||
| from scrapeghost import SchemaScraper, CSS | ||||
| 
 | ||||
| schema = { | ||||
|     "public_records_email": "email", | ||||
|     "public_records_address": "str", | ||||
|  | @ -19,7 +11,7 @@ schema = { | |||
|     "agency_logo": "url", | ||||
| } | ||||
| extra_instructions = """ | ||||
| The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedome of Information requests. | ||||
| The fields that begin with public_records should refer to contact information specific to FOIA/Public Information/Freedom of Information requests. | ||||
| The fields that begin with general_contact should refer to contact information for the agency in general. | ||||
| If a field is not found in the HTML, leave it as null in the JSON. | ||||
| """ | ||||
|  |  | |||
		Loading…
	
		Reference in a new issue
	
	 James Turk
						James Turk