diff --git a/src/whsk/__init__.py b/src/whsk/__init__.py index a3047d2..fdf61dc 100644 --- a/src/whsk/__init__.py +++ b/src/whsk/__init__.py @@ -14,6 +14,7 @@ cli = typer.Typer(help="whsk: web harvesting/scraping toolKit") VERSION = version("whsk") _user_agent = f"whsk/{VERSION}" +# Common Options opt = { "user_agent": typer.Option("--ua", help="User agent to make requests with"), "postdata": typer.Option( @@ -40,18 +41,65 @@ def parse_headers(headers: list[str]) -> dict: return header_dict +def make_request(url, headers, postdata): + header_dict = parse_headers(headers) + resp = httpx.request("GET", url, headers=header_dict, data=postdata) + # if resp.headers["content-type"] == "text/html": + root = lxml.html.fromstring(resp.text) + return resp, root + + +def parse_selectors(root, css, xpath): + # check for a selector + selected = selector = None + if css and xpath: + typer.secho("Cannot specify css and xpath", fg="red") + raise typer.Exit(1) + if css: + selector = css + selected = root.cssselect(css) + if xpath: + selector = xpath + selected = root.xpath(xpath) + return selector, selected + + +@cli.command() +def version(): + pyversion = sys.version.split(" ")[0] + console = Console() + console.print( + Panel( + f""" +W H H H SS K K +W W W HHH S KK +WWWWW H H SS K K v{VERSION} + """.lstrip() + + f"\npython {pyversion:>23}" + f"\nipython {IPython.__version__:>22}" + f"\nlxml.html {lxml.__version__:>20}" + f"\nhttpx {httpx.__version__:>24}", + style="cyan", + expand=False, + ) + ) + + @cli.command() def query( url: Annotated[str, typer.Argument(help="URL to scrape")], user_agent: Annotated[str, opt["user_agent"]] = _user_agent, postdata: Annotated[str, opt["postdata"]] = "", headers: Annotated[list[str], opt["headers"]] = [], + css: Annotated[str, opt["css"]] = "", + xpath: Annotated[str, opt["xpath"]] = "", ): """Run a one-off query against the URL""" - header_dict = parse_headers(headers) - typer.echo(f"Starting interactive shell for: {url}") - typer.echo(f"Headers: {header_dict}") - # Shell setup would go here + resp, root = make_request(url, headers, postdata) + selector, selected = parse_selectors(root, css, xpath) + + for s in selected: + print(s) @cli.command() @@ -64,38 +112,11 @@ def shell( xpath: Annotated[str, opt["xpath"]] = "", ): """Launch an interactive Python shell for scraping""" - pyversion = sys.version.split(" ")[0] - header_dict = parse_headers(headers) - resp = httpx.request("GET", url, headers=header_dict, data=postdata) - if resp.headers["content-type"] == "text/html": - root = lxml.html.fromstring(resp.text) - # check for a selector - selected = selector = None - if css and xpath: - typer.secho("Cannot specify css and xpath", fg="red") - raise typer.Exit(1) - if css: - selector = css - selected = root.cssselect(css) - if xpath: - selector = xpath - selected = root.xpath(xpath) + resp, root = make_request(url, headers, postdata) + selector, selected = parse_selectors(root, css, xpath) console = Console() - console.print( - Panel( - f""" -W H H H SS K K -W W W HHH S KK -WWWWW H H SS K K {VERSION} - """.lstrip() - + "\n" - f"ipython {IPython.__version__} | python {pyversion}", - style="cyan", - expand=False, - ) - ) table = Table( title="variables", show_header=False,