This commit is contained in:
James Turk 2025-01-26 03:37:28 -06:00
parent 766dc5d8e8
commit 123f12257a

View File

@ -14,6 +14,7 @@ cli = typer.Typer(help="whsk: web harvesting/scraping toolKit")
VERSION = version("whsk") VERSION = version("whsk")
_user_agent = f"whsk/{VERSION}" _user_agent = f"whsk/{VERSION}"
# Common Options
opt = { opt = {
"user_agent": typer.Option("--ua", help="User agent to make requests with"), "user_agent": typer.Option("--ua", help="User agent to make requests with"),
"postdata": typer.Option( "postdata": typer.Option(
@ -40,18 +41,65 @@ def parse_headers(headers: list[str]) -> dict:
return header_dict return header_dict
def make_request(url, headers, postdata):
    """Fetch ``url`` and parse the response body as HTML.

    Args:
        url: Address to request.
        headers: Raw header strings; converted to a dict by ``parse_headers``.
        postdata: Request body. When non-empty the request is sent as a
            POST; otherwise a plain GET with no body is issued.

    Returns:
        A ``(resp, root)`` tuple: the ``httpx`` response and the element
        produced by ``lxml.html.fromstring`` from the response text.
    """
    header_dict = parse_headers(headers)
    # A body only makes sense on a POST; previously it was attached to a GET.
    method = "POST" if postdata else "GET"
    # ``content=`` is httpx's parameter for raw text/bytes bodies; passing a
    # plain string through ``data=`` is deprecated.
    resp = httpx.request(
        method,
        url,
        headers=header_dict,
        content=postdata or None,
    )
    # The body is parsed as HTML regardless of the response content-type.
    root = lxml.html.fromstring(resp.text)
    return resp, root
def parse_selectors(root, css, xpath):
    """Apply at most one selector to ``root``.

    Args:
        root: Object exposing ``cssselect(expr)`` and ``xpath(expr)``.
        css: CSS selector expression, or a falsy value for "not given".
        xpath: XPath expression, or a falsy value for "not given".

    Returns:
        ``(selector, selected)``: the expression that was used and whatever
        the matching ``root`` method returned, or ``(None, None)`` when no
        selector was supplied.

    Raises:
        typer.Exit: with code 1 when both ``css`` and ``xpath`` are given.
    """
    # The two selector kinds are mutually exclusive.
    if css and xpath:
        typer.secho("Cannot specify css and xpath", fg="red")
        raise typer.Exit(1)

    if xpath:
        return xpath, root.xpath(xpath)
    if css:
        return css, root.cssselect(css)
    return None, None
# CLI command: prints a cyan panel containing the whsk banner with VERSION,
# followed by the versions of python, ipython, lxml and httpx.
# NOTE(review): this def rebinds the module-level name `version`, which is
# called earlier in the file to compute VERSION; any later code expecting
# that earlier callable would get this command function instead.
# NOTE(review): documented with comments rather than a docstring because a
# docstring would presumably be surfaced by typer as help text -- confirm.
@cli.command()
def version():
    # First space-separated token of sys.version.
    pyversion = sys.version.split(" ")[0]
    console = Console()
    console.print(
        Panel(
            # .lstrip() drops the newline that follows the opening quotes.
            f"""
W H H H SS K K
W W W HHH S KK
WWWWW H H SS K K v{VERSION}
""".lstrip()
            # Each value is right-aligned in a field sized so that label plus
            # value spans 30 characters (7+23, 8+22, 10+20, 6+24).
            + f"\npython {pyversion:>23}"
            f"\nipython {IPython.__version__:>22}"
            # The label reads "lxml.html" but the value is lxml.__version__.
            f"\nlxml.html {lxml.__version__:>20}"
            f"\nhttpx {httpx.__version__:>24}",
            style="cyan",
            # Do not stretch the panel beyond its content.
            expand=False,
        )
    )
@cli.command()
def query(
    url: Annotated[str, typer.Argument(help="URL to scrape")],
    user_agent: Annotated[str, opt["user_agent"]] = _user_agent,
    postdata: Annotated[str, opt["postdata"]] = "",
    headers: Annotated[list[str], opt["headers"]] = [],
    css: Annotated[str, opt["css"]] = "",
    xpath: Annotated[str, opt["xpath"]] = "",
):
    """Run a one-off query against the URL"""
    # Fetches the URL, applies the single css/xpath selector, and prints each
    # match on its own line. Exits with code 1 when no selector is given.
    # NOTE(review): user_agent is accepted but not forwarded; make_request
    # takes no user-agent argument.
    _resp, root = make_request(url, headers, postdata)
    _selector, selected = parse_selectors(root, css, xpath)

    # parse_selectors yields None when neither selector was supplied; fail
    # with a readable message instead of a TypeError from iterating None.
    if selected is None:
        typer.secho("Must specify css or xpath", fg="red")
        raise typer.Exit(1)

    # An XPath expression may evaluate to a single string, number or boolean
    # rather than a list of nodes; print such a result as one item.
    if not isinstance(selected, list):
        selected = [selected]

    for s in selected:
        print(s)
@cli.command() @cli.command()
@ -64,38 +112,11 @@ def shell(
xpath: Annotated[str, opt["xpath"]] = "", xpath: Annotated[str, opt["xpath"]] = "",
): ):
"""Launch an interactive Python shell for scraping""" """Launch an interactive Python shell for scraping"""
pyversion = sys.version.split(" ")[0]
header_dict = parse_headers(headers)
resp = httpx.request("GET", url, headers=header_dict, data=postdata)
if resp.headers["content-type"] == "text/html":
root = lxml.html.fromstring(resp.text)
# check for a selector resp, root = make_request(url, headers, postdata)
selected = selector = None selector, selected = parse_selectors(root, css, xpath)
if css and xpath:
typer.secho("Cannot specify css and xpath", fg="red")
raise typer.Exit(1)
if css:
selector = css
selected = root.cssselect(css)
if xpath:
selector = xpath
selected = root.xpath(xpath)
console = Console() console = Console()
console.print(
Panel(
f"""
W H H H SS K K
W W W HHH S KK
WWWWW H H SS K K {VERSION}
""".lstrip()
+ "\n"
f"ipython {IPython.__version__} | python {pyversion}",
style="cyan",
expand=False,
)
)
table = Table( table = Table(
title="variables", title="variables",
show_header=False, show_header=False,