# scrapple/examples/quickstart.py
# 2021-06-05 01:47:40 -04:00
#
# 50 lines
# 1.7 KiB
# Python

# imports we'll use in this example
from spatula import HtmlPage, HtmlListPage, CSS, XPath, SelectorError
class EmployeeList(HtmlListPage):
    """List page that turns each staff-table row into an ``EmployeeDetail``.

    Every ``<tr>`` matched by ``selector`` is passed to ``process_item``;
    ``get_next_source`` reports the href of the "Next" link when one exists.
    """

    # by providing this here, it can be omitted on the command line
    # useful in cases where the scraper is only meant for one page
    #
    # NOTE(review): this class used to assign
    # "https://yoyodyne-propulsion.herokuapp.com/staff" first and then
    # immediately rebind ``source`` to the local URL below, so the hosted URL
    # was never used. Only the effective value is kept; swap the URL here if
    # the hosted demo is the intended target.
    source = "http://localhost:5000/staff"

    # each row represents an employee
    selector = CSS("#employees tbody tr")

    def process_item(self, item):
        """Build the detail-page scrape for one employee row.

        This function is called for each ``<tr>`` we get from the selector.

        Args:
            item: the row element; its children are unpacked as the four
                cells (first name, last name, position, details link).

        Returns:
            An ``EmployeeDetail`` carrying the row's name/position data as
            input, with its source set to the href found in the details cell.
        """
        # we know there are 4 <td>s; any other count makes the unpacking
        # below raise ValueError rather than silently mis-assigning cells
        first, last, position, details = item.getchildren()
        return EmployeeDetail(
            dict(
                first=first.text,
                last=last.text,
                position=position.text,
            ),
            source=XPath("./a/@href").match_one(details),
        )

    def get_next_source(self):
        """Return the href of the "Next" link, or ``None`` if it can't be matched."""
        try:
            return XPath("//a[contains(text(), 'Next')]/@href").match_one(self.root)
        except SelectorError:
            # match_one could not resolve the "Next" link (presumably we are
            # on the last page); say so explicitly instead of falling through
            return None
class EmployeeDetail(HtmlPage):
    """Detail page for a single employee.

    Combines the fields read from this page with the data handed over by the
    scrape that created it.
    """

    def process_page(self):
        """Return this page's fields merged with ``self.input`` as one dict."""
        status_el = CSS("#status").match_one(self.root)
        children_el = CSS("#children").match_one(self.root)
        hired_el = CSS("#hired").match_one(self.root)

        page_fields = {
            "marital_status": status_el.text,
            "children": children_el.text,
            "hired": hired_el.text,
        }
        # self.input is the data passed in from the prior scrape,
        # in this case a dict we can expand here. Both mappings are expanded
        # as keyword arguments, so a key present in both raises TypeError
        # instead of one value silently replacing the other.
        return dict(**page_fields, **self.input)

    def process_error_response(self, exception):
        """Record the given exception as a warning on this page's logger."""
        log = self.logger
        log.warning(exception)