handle more response types

This commit is contained in:
James Turk 2025-01-26 10:46:12 -06:00
parent 123f12257a
commit 61be304871
2 changed files with 43 additions and 7 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
*.pyc
dist/

View File

@ -41,11 +41,29 @@ def parse_headers(headers: list[str]) -> dict:
return header_dict
def make_request(url, headers, postdata):
def make_request(url, *, headers, user_agent, postdata):
"""
Helper function for redundant code between methods.
Will take all parameters related to request and return
a 2-tuple of the raw response and the parsed response.
The second parameter may be a:
- JSON dict
- lxml.html.HtmlElement
- lxml.etree.Element
"""
header_dict = parse_headers(headers)
resp = httpx.request("GET", url, headers=header_dict, data=postdata)
# if resp.headers["content-type"] == "text/html":
root = lxml.html.fromstring(resp.text)
method = "GET"
if postdata:
method = "POST"
resp = httpx.request(method, url, headers=header_dict, data=postdata)
if resp.headers["content-type"] == "application/json":
root = resp.json()
elif "xml" in resp.headers["content-type"]:
root = lxml.etree.fromstring(resp.content)
else:
root = lxml.html.fromstring(resp.text)
return resp, root
@ -95,9 +113,18 @@ def query(
xpath: Annotated[str, opt["xpath"]] = "",
):
"""Run a one-off query against the URL"""
resp, root = make_request(url, headers, postdata)
resp, root = make_request(
url, headers=headers, user_agent=user_agent, postdata=postdata
)
if not isinstance(root, lxml.html.HtmlElement):
typer.secho(f"Expecting HTML response, got:\n{root}", fg="red")
raise typer.Exit(1)
selector, selected = parse_selectors(root, css, xpath)
if selector is None:
typer.secho("Must provide either --css or --xpath to query", fg="red")
raise typer.Exit(1)
for s in selected:
print(s)
@ -113,7 +140,9 @@ def shell(
):
"""Launch an interactive Python shell for scraping"""
resp, root = make_request(url, headers, postdata)
resp, root = make_request(
url, headers=headers, user_agent=user_agent, postdata=postdata
)
selector, selected = parse_selectors(root, css, xpath)
console = Console()
@ -125,7 +154,13 @@ def shell(
)
table.add_row("[green]url[/green]", url)
table.add_row("[green]resp[/green]", str(resp))
table.add_row("[green]root[/green]", "lxml.html.Element")
# if this list of parsed types expands there's probably a better way
if isinstance(root, lxml.html.HtmlElement):
table.add_row("[green]root[/green]", "lxml.html.HtmlElement")
elif isinstance(root, lxml.etree._Element):
table.add_row("[green]root[/green]", "lxml.etree.Element (XML)")
elif isinstance(root, dict):
table.add_row("[green]root[/green]", "dict (JSON)")
if selector:
table.add_row("[green]selector[/green]", selector)
table.add_row("[green]selected[/green]", f"{len(selected)} elements")