v 0.1
This commit is contained in:
parent
766dc5d8e8
commit
123f12257a
@ -14,6 +14,7 @@ cli = typer.Typer(help="whsk: web harvesting/scraping toolKit")
|
||||
VERSION = version("whsk")
|
||||
_user_agent = f"whsk/{VERSION}"
|
||||
|
||||
# Common Options
|
||||
opt = {
|
||||
"user_agent": typer.Option("--ua", help="User agent to make requests with"),
|
||||
"postdata": typer.Option(
|
||||
@ -40,18 +41,65 @@ def parse_headers(headers: list[str]) -> dict:
|
||||
return header_dict
|
||||
|
||||
|
||||
def make_request(url, headers, postdata):
    """Fetch ``url`` and parse the response body with ``lxml.html``.

    Args:
        url: Address to request.
        headers: Raw header strings; converted to a dict by ``parse_headers``.
        postdata: Request body. When non-empty the request is sent as a
            ``POST``; otherwise a plain ``GET`` is issued.

    Returns:
        A ``(resp, root)`` tuple: the ``httpx`` response object and the
        element tree built from ``resp.text``.
    """
    header_dict = parse_headers(headers)

    # A request that carries a body must be a POST; sending the body along
    # with a GET is ignored or rejected by most servers.
    method = "POST" if postdata else "GET"

    # Pass ``None`` rather than an empty string when there is no body.
    resp = httpx.request(
        method,
        url,
        headers=header_dict,
        data=postdata or None,
    )

    # The body is parsed regardless of the response's content type.
    root = lxml.html.fromstring(resp.text)
    return resp, root
|
||||
|
||||
|
||||
def parse_selectors(root, css, xpath):
    """Apply at most one selector to ``root`` and return the matches.

    Args:
        root: Parsed document exposing ``cssselect`` and ``xpath`` methods.
        css: CSS selector, or a falsy value when not given.
        xpath: XPath expression, or a falsy value when not given.

    Returns:
        A ``(selector, selected)`` tuple holding the selector that was used
        and whatever the matching ``root`` method returned, or
        ``(None, None)`` when neither selector was supplied.

    Raises:
        typer.Exit: With exit code 1 when both ``css`` and ``xpath`` are given.
    """
    # The two selector kinds are mutually exclusive.
    if css and xpath:
        typer.secho("Cannot specify css and xpath", fg="red")
        raise typer.Exit(1)

    if xpath:
        return xpath, root.xpath(xpath)
    if css:
        return css, root.cssselect(css)

    # No selector requested.
    return None, None
|
||||
|
||||
|
||||
@cli.command()
def version():
    # Print a cyan panel showing the whsk banner followed by the versions of
    # python, ipython, lxml and httpx, each right-aligned in its own row.
    #
    # NOTE(review): this definition rebinds the module-level name `version`,
    # which `VERSION = version("whsk")` calls earlier in the file. That works
    # only because the call runs before this def; confirm nothing calls the
    # original `version` afterwards.
    py_release = sys.version.partition(" ")[0]

    # Banner text; lstrip() drops the newline that follows the opening quotes.
    banner = f"""
W H H H SS K K
W W W HHH S KK
WWWWW H H SS K K v{VERSION}
""".lstrip()

    # One row per component; the widths keep the version column aligned.
    rows = (
        f"\npython {py_release:>23}",
        f"\nipython {IPython.__version__:>22}",
        f"\nlxml.html {lxml.__version__:>20}",
        f"\nhttpx {httpx.__version__:>24}",
    )

    panel = Panel(banner + "".join(rows), style="cyan", expand=False)
    Console().print(panel)
|
||||
|
||||
|
||||
@cli.command()
def query(
    url: Annotated[str, typer.Argument(help="URL to scrape")],
    user_agent: Annotated[str, opt["user_agent"]] = _user_agent,
    postdata: Annotated[str, opt["postdata"]] = "",
    headers: Annotated[list[str], opt["headers"]] = [],
    css: Annotated[str, opt["css"]] = "",
    xpath: Annotated[str, opt["xpath"]] = "",
):
    """Run a one-off query against the URL"""
    # Parameters:
    #   url        -- address to fetch.
    #   user_agent -- NOTE(review): accepted but not forwarded anywhere;
    #                 make_request has no parameter for it. Confirm intent.
    #   postdata   -- request body handed to make_request.
    #   headers    -- raw header strings (never mutated here, so the shared
    #                 list default is harmless).
    #   css/xpath  -- the selector to evaluate; exactly one is expected.
    #
    # Exits with status 1 when no selector is given.

    # Without a selector there is nothing to print; fail early with a clear
    # message instead of fetching the page and then iterating over ``None``.
    if not (css or xpath):
        typer.secho("Must specify css or xpath", fg="red")
        raise typer.Exit(1)

    header_dict = parse_headers(headers)
    typer.echo(f"Querying: {url}")
    typer.echo(f"Headers: {header_dict}")

    _resp, root = make_request(url, headers, postdata)
    _selector, selected = parse_selectors(root, css, xpath)

    # An XPath expression may evaluate to a scalar (string, number, boolean)
    # instead of a list of nodes; print such a result as a single value.
    if isinstance(selected, list):
        for s in selected:
            print(s)
    else:
        print(selected)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@ -64,38 +112,11 @@ def shell(
|
||||
xpath: Annotated[str, opt["xpath"]] = "",
|
||||
):
|
||||
"""Launch an interactive Python shell for scraping"""
|
||||
pyversion = sys.version.split(" ")[0]
|
||||
header_dict = parse_headers(headers)
|
||||
resp = httpx.request("GET", url, headers=header_dict, data=postdata)
|
||||
if resp.headers["content-type"] == "text/html":
|
||||
root = lxml.html.fromstring(resp.text)
|
||||
|
||||
# check for a selector
|
||||
selected = selector = None
|
||||
if css and xpath:
|
||||
typer.secho("Cannot specify css and xpath", fg="red")
|
||||
raise typer.Exit(1)
|
||||
if css:
|
||||
selector = css
|
||||
selected = root.cssselect(css)
|
||||
if xpath:
|
||||
selector = xpath
|
||||
selected = root.xpath(xpath)
|
||||
resp, root = make_request(url, headers, postdata)
|
||||
selector, selected = parse_selectors(root, css, xpath)
|
||||
|
||||
console = Console()
|
||||
console.print(
|
||||
Panel(
|
||||
f"""
|
||||
W H H H SS K K
|
||||
W W W HHH S KK
|
||||
WWWWW H H SS K K {VERSION}
|
||||
""".lstrip()
|
||||
+ "\n"
|
||||
f"ipython {IPython.__version__} | python {pyversion}",
|
||||
style="cyan",
|
||||
expand=False,
|
||||
)
|
||||
)
|
||||
table = Table(
|
||||
title="variables",
|
||||
show_header=False,
|
||||
|
Loading…
Reference in New Issue
Block a user