v 0.1
This commit is contained in:
parent
766dc5d8e8
commit
123f12257a
@ -14,6 +14,7 @@ cli = typer.Typer(help="whsk: web harvesting/scraping toolKit")
|
|||||||
VERSION = version("whsk")
|
VERSION = version("whsk")
|
||||||
_user_agent = f"whsk/{VERSION}"
|
_user_agent = f"whsk/{VERSION}"
|
||||||
|
|
||||||
|
# Common Options
|
||||||
opt = {
|
opt = {
|
||||||
"user_agent": typer.Option("--ua", help="User agent to make requests with"),
|
"user_agent": typer.Option("--ua", help="User agent to make requests with"),
|
||||||
"postdata": typer.Option(
|
"postdata": typer.Option(
|
||||||
@ -40,18 +41,65 @@ def parse_headers(headers: list[str]) -> dict:
|
|||||||
return header_dict
|
return header_dict
|
||||||
|
|
||||||
|
|
||||||
|
def make_request(url, headers, postdata):
    """Fetch ``url`` and parse the response body as HTML.

    Args:
        url: Address to request.
        headers: Raw header strings; turned into a dict by ``parse_headers``.
        postdata: Raw request body. When non-empty the request is sent as a
            POST carrying this body; otherwise a plain GET is issued.

    Returns:
        A ``(resp, root)`` tuple: the ``httpx`` response and the element
        produced by ``lxml.html.fromstring`` from ``resp.text``.
    """
    header_dict = parse_headers(headers)
    # A body attached to a GET has no defined semantics, so switch to POST
    # whenever there is data to send.
    method = "POST" if postdata else "GET"
    # httpx reserves ``data=`` for form mappings and deprecates passing raw
    # text through it; raw bodies go in ``content=``.
    resp = httpx.request(
        method,
        url,
        headers=header_dict,
        content=postdata or None,
    )
    # The body is parsed as HTML regardless of the response content-type.
    root = lxml.html.fromstring(resp.text)
    return resp, root
|
||||||
|
|
||||||
|
|
||||||
|
def parse_selectors(root, css, xpath):
    """Apply at most one selector to ``root``.

    Args:
        root: Object exposing ``cssselect`` and ``xpath`` methods.
        css: CSS selector string; falsy to skip.
        xpath: XPath expression string; falsy to skip.

    Returns:
        A ``(selector, selected)`` tuple: the selector string that was used
        and whatever the matching method returned. ``(None, None)`` when
        neither selector is given.

    Raises:
        typer.Exit: With exit code 1 when both ``css`` and ``xpath`` are
            given.
    """
    if css and xpath:
        # Report on stderr so the message never mixes with results that
        # callers print to stdout.
        typer.secho("Cannot specify css and xpath", fg="red", err=True)
        raise typer.Exit(1)
    if css:
        return css, root.cssselect(css)
    if xpath:
        return xpath, root.xpath(xpath)
    return None, None
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command()
def version():
    # Prints a panel with the whsk banner and the versions of python,
    # ipython, lxml and httpx.
    # NOTE(review): this def rebinds the module-level name `version`, which
    # is called earlier in the file to compute VERSION — confirm nothing
    # calls the original function after this point.
    python_version = sys.version.split(" ")[0]

    banner = f"""
W H H H SS K K
W W W HHH S KK
WWWWW H H SS K K v{VERSION}
""".lstrip()

    # (label, value, width): each value is right-aligned within its width so
    # the version numbers end in the same column.
    rows = (
        ("python", python_version, 23),
        ("ipython", IPython.__version__, 22),
        ("lxml.html", lxml.__version__, 20),
        ("httpx", httpx.__version__, 24),
    )
    details = "".join(
        f"\n{label} {value:>{width}}" for label, value, width in rows
    )

    panel = Panel(banner + details, style="cyan", expand=False)
    Console().print(panel)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
def query(
    url: Annotated[str, typer.Argument(help="URL to scrape")],
    user_agent: Annotated[str, opt["user_agent"]] = _user_agent,
    postdata: Annotated[str, opt["postdata"]] = "",
    headers: Annotated[list[str], opt["headers"]] = [],
    css: Annotated[str, opt["css"]] = "",
    xpath: Annotated[str, opt["xpath"]] = "",
):
    """Run a one-off query against the URL"""
    # NOTE(review): user_agent is accepted but never forwarded to
    # make_request here — confirm whether it should be sent as a header.
    _, root = make_request(url, headers, postdata)
    _, selected = parse_selectors(root, css, xpath)
    # parse_selectors yields None for the selection when neither css nor
    # xpath is given; treat that as "nothing to print" instead of raising
    # TypeError on iteration.
    for s in selected or []:
        print(s)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@ -64,38 +112,11 @@ def shell(
|
|||||||
xpath: Annotated[str, opt["xpath"]] = "",
|
xpath: Annotated[str, opt["xpath"]] = "",
|
||||||
):
|
):
|
||||||
"""Launch an interactive Python shell for scraping"""
|
"""Launch an interactive Python shell for scraping"""
|
||||||
pyversion = sys.version.split(" ")[0]
|
|
||||||
header_dict = parse_headers(headers)
|
|
||||||
resp = httpx.request("GET", url, headers=header_dict, data=postdata)
|
|
||||||
if resp.headers["content-type"] == "text/html":
|
|
||||||
root = lxml.html.fromstring(resp.text)
|
|
||||||
|
|
||||||
# check for a selector
|
resp, root = make_request(url, headers, postdata)
|
||||||
selected = selector = None
|
selector, selected = parse_selectors(root, css, xpath)
|
||||||
if css and xpath:
|
|
||||||
typer.secho("Cannot specify css and xpath", fg="red")
|
|
||||||
raise typer.Exit(1)
|
|
||||||
if css:
|
|
||||||
selector = css
|
|
||||||
selected = root.cssselect(css)
|
|
||||||
if xpath:
|
|
||||||
selector = xpath
|
|
||||||
selected = root.xpath(xpath)
|
|
||||||
|
|
||||||
console = Console()
|
console = Console()
|
||||||
console.print(
|
|
||||||
Panel(
|
|
||||||
f"""
|
|
||||||
W H H H SS K K
|
|
||||||
W W W HHH S KK
|
|
||||||
WWWWW H H SS K K {VERSION}
|
|
||||||
""".lstrip()
|
|
||||||
+ "\n"
|
|
||||||
f"ipython {IPython.__version__} | python {pyversion}",
|
|
||||||
style="cyan",
|
|
||||||
expand=False,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
table = Table(
|
table = Table(
|
||||||
title="variables",
|
title="variables",
|
||||||
show_header=False,
|
show_header=False,
|
||||||
|
Loading…
Reference in New Issue
Block a user