diff --git a/README.md b/README.md index e69de29..619f6e4 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,85 @@ +# whsk + +**whsk** is a command-line utility for web scraper authors. + +It provides a set of utilities for inspecting HTML responses, and applying selectors against them. + +## Installation + +It is recommended you install whsk with `uvx` or `pipx`: + +`uvx whsk` is the fastest way to get running with `whsk`. + +It currently consists of two utilities: + +## whsk shell + +`whsk shell` fetches a page, automatically parsing HTML, XML, or JSON responses. +It then opens an `ipython` shell allowing you to interact with the raw and parsed response. + +When the command runs it will print a table of the variables it has loaded (which will depend on the type of page and particular flags passed): + +``` + variables +┌──────────┬───────────────────────┐ +│ url │ https://example.com │ +│ resp │ │ +│ root │ lxml.html.HtmlElement │ +│ selector │ //p │ +│ selected │ 2 elements │ +└──────────┴───────────────────────┘ + +In [1]: +``` + +The `In [1]:` is an `ipython` prompt; the variables in the table are available for inspection & usage. + +### Options + +``` + Usage: whsk shell [OPTIONS] URL + + Launch an interactive Python shell for scraping + +╭─ Arguments ──────────────────────────────────────────────────────────────────────────╮ +│ * url TEXT URL to scrape [default: None] [required] │ +╰──────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────────────╮ +│ --ua TEXT User agent to make requests with │ +│ --postdata -p TEXT POST data (will make a POST instead of GET) │ +│ --header -h TEXT Additional headers in format 'Name: Value' │ +│ --css -c TEXT css selector │ +│ --xpath -x TEXT xpath selector │ +│ --help Show this message and exit. 
│ +╰──────────────────────────────────────────────────────────────────────────────────────╯ +``` + +## whsk query + +`whsk query` takes the same command line options as `whsk shell` but instead of opening a shell +will output the results of the `--css` or `--xpath` selection, and then exit immediately. + +As such, you must provide *one* of the two selector parameters. + +This can be used for rapid testing of queries without opening the shell each time. + +### Options + +``` +Usage: whsk query [OPTIONS] URL + + Run a one-off query against the URL + +╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────╮ +│ * url TEXT URL to scrape [default: None] [required] │ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────╮ +│ --ua TEXT User agent to make requests with │ +│ --postdata -p TEXT POST data (will make a POST instead of GET) │ +│ --header -h TEXT Additional headers in format 'Name: Value' │ +│ --css -c TEXT css selector │ +│ --xpath -x TEXT xpath selector │ +│ --help Show this message and exit. 
│ +╰─────────────────────────────────────────────────────────────────────────────────────────────────────╯ + +``` diff --git a/pyproject.toml b/pyproject.toml index a3b7b91..7fc981a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "whsk" -version = "0.1.0" +version = "0.2.0" description = "web/html scraping toolkit" readme = "README.md" requires-python = ">=3.10" diff --git a/uv.lock b/uv.lock index 6bd6469..851b679 100644 --- a/uv.lock +++ b/uv.lock @@ -39,7 +39,7 @@ name = "click" version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "colorama", marker = "platform_system == 'Windows'" }, + { name = "colorama", marker = "sys_platform == 'win32'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ @@ -436,7 +436,7 @@ wheels = [ [[package]] name = "whsk" -version = "0.1.0" +version = "0.2.0" source = { editable = "." } dependencies = [ { name = "cssselect" },