handle more response types
This commit is contained in:
parent
123f12257a
commit
61be304871
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
*.pyc
|
*.pyc
|
||||||
|
dist/
|
||||||
|
@ -41,10 +41,28 @@ def parse_headers(headers: list[str]) -> dict:
|
|||||||
return header_dict
|
return header_dict
|
||||||
|
|
||||||
|
|
||||||
def make_request(url, headers, postdata):
|
def make_request(url, *, headers, user_agent, postdata):
|
||||||
|
"""
|
||||||
|
Helper function for redundant code between methods.
|
||||||
|
|
||||||
|
Will take all parameters related to request and return
|
||||||
|
a 2-tuple of the raw response and the parsed response.
|
||||||
|
|
||||||
|
The second parameter may be a:
|
||||||
|
- JSON dict
|
||||||
|
- lxml.html.HtmlElement
|
||||||
|
- lxml.etree.Element
|
||||||
|
"""
|
||||||
header_dict = parse_headers(headers)
|
header_dict = parse_headers(headers)
|
||||||
resp = httpx.request("GET", url, headers=header_dict, data=postdata)
|
method = "GET"
|
||||||
# if resp.headers["content-type"] == "text/html":
|
if postdata:
|
||||||
|
method = "POST"
|
||||||
|
resp = httpx.request(method, url, headers=header_dict, data=postdata)
|
||||||
|
if resp.headers["content-type"] == "application/json":
|
||||||
|
root = resp.json()
|
||||||
|
elif "xml" in resp.headers["content-type"]:
|
||||||
|
root = lxml.etree.fromstring(resp.content)
|
||||||
|
else:
|
||||||
root = lxml.html.fromstring(resp.text)
|
root = lxml.html.fromstring(resp.text)
|
||||||
return resp, root
|
return resp, root
|
||||||
|
|
||||||
@ -95,9 +113,18 @@ def query(
|
|||||||
xpath: Annotated[str, opt["xpath"]] = "",
|
xpath: Annotated[str, opt["xpath"]] = "",
|
||||||
):
|
):
|
||||||
"""Run a one-off query against the URL"""
|
"""Run a one-off query against the URL"""
|
||||||
resp, root = make_request(url, headers, postdata)
|
resp, root = make_request(
|
||||||
|
url, headers=headers, user_agent=user_agent, postdata=postdata
|
||||||
|
)
|
||||||
|
if not isinstance(root, lxml.html.HtmlElement):
|
||||||
|
typer.secho(f"Expecting HTML response, got:\n{root}", fg="red")
|
||||||
|
raise typer.Exit(1)
|
||||||
selector, selected = parse_selectors(root, css, xpath)
|
selector, selected = parse_selectors(root, css, xpath)
|
||||||
|
|
||||||
|
if selector is None:
|
||||||
|
typer.secho("Must provide either --css or --xpath to query", fg="red")
|
||||||
|
raise typer.Exit(1)
|
||||||
|
|
||||||
for s in selected:
|
for s in selected:
|
||||||
print(s)
|
print(s)
|
||||||
|
|
||||||
@ -113,7 +140,9 @@ def shell(
|
|||||||
):
|
):
|
||||||
"""Launch an interactive Python shell for scraping"""
|
"""Launch an interactive Python shell for scraping"""
|
||||||
|
|
||||||
resp, root = make_request(url, headers, postdata)
|
resp, root = make_request(
|
||||||
|
url, headers=headers, user_agent=user_agent, postdata=postdata
|
||||||
|
)
|
||||||
selector, selected = parse_selectors(root, css, xpath)
|
selector, selected = parse_selectors(root, css, xpath)
|
||||||
|
|
||||||
console = Console()
|
console = Console()
|
||||||
@ -125,7 +154,13 @@ def shell(
|
|||||||
)
|
)
|
||||||
table.add_row("[green]url[/green]", url)
|
table.add_row("[green]url[/green]", url)
|
||||||
table.add_row("[green]resp[/green]", str(resp))
|
table.add_row("[green]resp[/green]", str(resp))
|
||||||
table.add_row("[green]root[/green]", "lxml.html.Element")
|
# if this list of parsed types expands there's probably a better way
|
||||||
|
if isinstance(root, lxml.html.HtmlElement):
|
||||||
|
table.add_row("[green]root[/green]", "lxml.html.HtmlElement")
|
||||||
|
elif isinstance(root, lxml.etree._Element):
|
||||||
|
table.add_row("[green]root[/green]", "lxml.etree.Element (XML)")
|
||||||
|
elif isinstance(root, dict):
|
||||||
|
table.add_row("[green]root[/green]", "dict (JSON)")
|
||||||
if selector:
|
if selector:
|
||||||
table.add_row("[green]selector[/green]", selector)
|
table.add_row("[green]selector[/green]", selector)
|
||||||
table.add_row("[green]selected[/green]", f"{len(selected)} elements")
|
table.add_row("[green]selected[/green]", f"{len(selected)} elements")
|
||||||
|
Loading…
Reference in New Issue
Block a user