readme
This commit is contained in:
parent
e6abe68e6e
commit
a5da7840b5
85
README.md
85
README.md
@ -0,0 +1,85 @@
|
||||
# whsk
|
||||
|
||||
**whsk** is a command line utility for web scraper authors.
|
||||
|
||||
It provides a set of utilities for inspecting HTML responses, and applying selectors against them.
|
||||
|
||||
## Installation
|
||||
|
||||
It is recommended you install whsk with `uvx` or `pipx`:
|
||||
|
||||
`uvx whsk` is the fastest way to get running with `whsk`.
|
||||
|
||||
whsk currently consists of two utilities, `whsk shell` and `whsk query`, described below.
|
||||
|
||||
## whsk shell
|
||||
|
||||
`whsk shell` fetches a page, automatically parsing HTML, XML, or JSON responses.
|
||||
It then opens an `ipython` shell allowing you to interact with the raw and parsed response.
|
||||
|
||||
When the command runs it will print a table of the variables it has loaded (which will depend on the type of page and particular flags passed):
|
||||
|
||||
```
|
||||
variables
|
||||
┌──────────┬───────────────────────┐
|
||||
│ url │ https://example.com │
|
||||
│ resp │ <Response [200 OK]> │
|
||||
│ root │ lxml.html.HtmlElement │
|
||||
│ selector │ //p │
|
||||
│ selected │ 2 elements │
|
||||
└──────────┴───────────────────────┘
|
||||
|
||||
In [1]:
|
||||
```
|
||||
|
||||
The `In [1]:` is an `ipython` prompt; the variables in the table are available for inspection & usage.
|
||||
|
||||
### Options
|
||||
|
||||
```
|
||||
Usage: whsk shell [OPTIONS] URL
|
||||
|
||||
Launch an interactive Python shell for scraping
|
||||
|
||||
╭─ Arguments ──────────────────────────────────────────────────────────────────────────╮
|
||||
│ * url TEXT URL to scrape [default: None] [required] │
|
||||
╰──────────────────────────────────────────────────────────────────────────────────────╯
|
||||
╭─ Options ────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --ua TEXT User agent to make requests with │
|
||||
│ --postdata -p TEXT POST data (will make a POST instead of GET) │
|
||||
│ --header -h TEXT Additional headers in format 'Name: Value' │
|
||||
│ --css -c TEXT css selector │
|
||||
│ --xpath -x TEXT xpath selector │
|
||||
│ --help Show this message and exit. │
|
||||
╰──────────────────────────────────────────────────────────────────────────────────────╯
|
||||
```
|
||||
|
||||
## whsk query
|
||||
|
||||
`whsk query` takes the same command line options as `whsk shell` but instead of opening a shell
|
||||
it will output the results of the `--css` or `--xpath` selection, and then exit immediately.
|
||||
|
||||
As such, you must provide *one* of the two selector parameters.
|
||||
|
||||
This can be used for rapid testing of queries without opening the shell each time.
|
||||
|
||||
### Options
|
||||
|
||||
```
|
||||
Usage: whsk query [OPTIONS] URL
|
||||
|
||||
Run a one-off query against the URL
|
||||
|
||||
╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ * url TEXT URL to scrape [default: None] [required] │
|
||||
╰─────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────╮
|
||||
│ --ua TEXT User agent to make requests with │
|
||||
│ --postdata -p TEXT POST data (will make a POST instead of GET) │
|
||||
│ --header -h TEXT Additional headers in format 'Name: Value' │
|
||||
│ --css -c TEXT css selector │
|
||||
│ --xpath -x TEXT xpath selector │
|
||||
│ --help Show this message and exit. │
|
||||
╰─────────────────────────────────────────────────────────────────────────────────────────────────────╯
|
||||
|
||||
```
|
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "whsk"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
description = "web/html scraping toolkit"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
|
4
uv.lock
generated
4
uv.lock
generated
@ -39,7 +39,7 @@ name = "click"
|
||||
version = "8.1.8"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama", marker = "platform_system == 'Windows'" },
|
||||
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 }
|
||||
wheels = [
|
||||
@ -436,7 +436,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "whsk"
|
||||
version = "0.1.0"
|
||||
version = "0.2.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "cssselect" },
|
||||
|
Loading…
Reference in New Issue
Block a user