diff --git a/src/whsk/__init__.py b/src/whsk/__init__.py index 3fd687b..fb93b40 100644 --- a/src/whsk/__init__.py +++ b/src/whsk/__init__.py @@ -8,11 +8,12 @@ from rich.console import Console from rich.panel import Panel from typing_extensions import Annotated from importlib.metadata import version +from .user_agents import USER_AGENTS cli = typer.Typer(help="whsk: web harvesting/scraping toolKit") VERSION = version("whsk") -_user_agent = f"whsk/{VERSION}" +_default_user_agent = f"whsk/{VERSION}" # Common Options opt = { @@ -54,6 +55,21 @@ def make_request(url, *, headers, user_agent, postdata): - lxml.etree.Element """ header_dict = parse_headers(headers) + + # user agent either from headers, shortcut, or default + if "user-agent" in headers and user_agent: + typer.secho("Cannot use --ua shortcut and also pass --header User-Agent") + raise typer.Exit(1) + elif "user-agent" in header_dict: + pass # make no changes + elif not user_agent: + header_dict["user-agent"] = _default_user_agent + elif user_agent in USER_AGENTS: + header_dict["user-agent"] = USER_AGENTS[user_agent] + else: + typer.secho("--ua shortcut must be one of: " + ", ".join(USER_AGENTS)) + raise typer.Exit(1) + method = "GET" if postdata: method = "POST" @@ -106,7 +122,7 @@ WWWWW H H SS K K v{VERSION} @cli.command() def query( url: Annotated[str, typer.Argument(help="URL to scrape")], - user_agent: Annotated[str, opt["user_agent"]] = _user_agent, + user_agent: Annotated[str, opt["user_agent"]] = "", postdata: Annotated[str, opt["postdata"]] = "", headers: Annotated[list[str], opt["headers"]] = [], css: Annotated[str, opt["css"]] = "", @@ -132,7 +148,7 @@ def query( @cli.command() def shell( url: Annotated[str, typer.Argument(help="URL to scrape")], - user_agent: Annotated[str, opt["user_agent"]] = _user_agent, + user_agent: Annotated[str, opt["user_agent"]] = "", postdata: Annotated[str, opt["postdata"]] = "", headers: Annotated[list[str], opt["headers"]] = [], css: Annotated[str, opt["css"]] = "", 
"""Browser User-Agent presets (module ``src/whsk/user_agents.py``).

``USER_AGENTS`` maps a short ``<platform>.<browser>`` name to the full
``User-Agent`` header value sent when that name is chosen via the ``--ua``
shortcut.
"""

# based on the list from https://www.useragents.me
#
# NOTE: several strings copied from that list were missing their final
# character(s) ("Safari/537.3", "Safari/605.1.1", "Edg/131.0.0.",
# "Firefox/133").  A truncated product token does not match what the real
# browser sends, which defeats the purpose of impersonating it, so the
# complete tokens are spelled out here.
USER_AGENTS: dict[str, str] = {
    "linux.chrome": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    # Firefox always carries an "rv:<version>" token inside the platform
    # parentheses; kept consistent with the win.firefox entry.
    "linux.firefox": "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "mac.chrome": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    # Firefox on macOS reports a dotted OS version ("10.15"), unlike the
    # underscore form used by Chrome and Safari.
    "mac.firefox": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:133.0) Gecko/20100101 Firefox/133.0",
    "mac.safari": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15",
    "win.chrome": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "win.edge": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
    "win.firefox": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
}