user-agent flag

This commit is contained in:
James Turk 2025-01-26 11:05:50 -06:00
parent 61be304871
commit e6abe68e6e
2 changed files with 30 additions and 3 deletions

View File

@ -8,11 +8,12 @@ from rich.console import Console
from rich.panel import Panel
from typing_extensions import Annotated
from importlib.metadata import version
from .user_agents import USER_AGENTS
cli = typer.Typer(help="whsk: web harvesting/scraping toolKit")
VERSION = version("whsk")
_user_agent = f"whsk/{VERSION}"
_default_user_agent = f"whsk/{VERSION}"
# Common Options
opt = {
@ -54,6 +55,21 @@ def make_request(url, *, headers, user_agent, postdata):
- lxml.etree.Element
"""
header_dict = parse_headers(headers)
# user agent either from headers, shortcut, or default
if "user-agent" in headers and user_agent:
typer.secho("Cannot use --ua shortcut and also pass --header User-Agent")
raise typer.Exit(1)
elif "user-agent" in header_dict:
pass # make no changes
elif not user_agent:
header_dict["user-agent"] = _default_user_agent
elif user_agent in USER_AGENTS:
header_dict["user-agent"] = USER_AGENTS[user_agent]
else:
typer.secho("--ua shortcut must be one of: " + ", ".join(USER_AGENTS))
raise typer.Exit(1)
method = "GET"
if postdata:
method = "POST"
@ -106,7 +122,7 @@ WWWWW H H SS K K v{VERSION}
@cli.command()
def query(
url: Annotated[str, typer.Argument(help="URL to scrape")],
user_agent: Annotated[str, opt["user_agent"]] = _user_agent,
user_agent: Annotated[str, opt["user_agent"]] = "",
postdata: Annotated[str, opt["postdata"]] = "",
headers: Annotated[list[str], opt["headers"]] = [],
css: Annotated[str, opt["css"]] = "",
@ -132,7 +148,7 @@ def query(
@cli.command()
def shell(
url: Annotated[str, typer.Argument(help="URL to scrape")],
user_agent: Annotated[str, opt["user_agent"]] = _user_agent,
user_agent: Annotated[str, opt["user_agent"]] = "",
postdata: Annotated[str, opt["postdata"]] = "",
headers: Annotated[list[str], opt["headers"]] = [],
css: Annotated[str, opt["css"]] = "",

11
src/whsk/user_agents.py Normal file
View File

@ -0,0 +1,11 @@
# Shortcut name -> full User-Agent header value, used by the --ua option.
# Based on the list from https://www.useragents.me, with truncated trailing
# version tokens restored so each string matches what the real browser sends:
#   * Chromium-family strings end in "Safari/537.36" (same as AppleWebKit/537.36)
#   * Safari ends in "Safari/605.1.15" (same as AppleWebKit/605.1.15)
#   * Edge carries a full four-part "Edg/" version
#   * Firefox strings carry an "rv:" token and a "major.minor" Firefox version
USER_AGENTS = {
    "linux.chrome": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "linux.firefox": "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0",
    "mac.chrome": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
    "mac.firefox": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6; rv:133.0) Gecko/20100101 Firefox/133.0",
    "mac.safari": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15",
    "win.chrome": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "win.edge": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0",
    "win.firefox": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
}