conversion to parks

This commit is contained in:
James Turk 2022-12-07 00:11:21 -06:00
parent 1385722b10
commit f95f2e03c1
12 changed files with 137 additions and 1159 deletions

View File

@ -1,3 +1,4 @@
Sample site for web scraping practice.
Visit https://yoyodyne-propulsion.fly.dev/ to see the site. Visit https://yoyodyne-propulsion.fly.dev/ to see the site.
This is a sample site for [spatula](https://jamesturk.github.io/spatula/) docs.

100
app.py
View File

@ -1,19 +1,18 @@
import csv import json
import math import math
from flask import Flask, render_template, abort, request from flask import Flask, render_template, abort, request
app = Flask(__name__) app = Flask(__name__)
_employees = {} _data = {}
def parks():
    """Return every park keyed by its ``id``, loading the data file on first use.

    The parsed contents of ``data/parks.json`` are cached in the module-level
    ``_data`` dict, so the file is read only while that cache is empty.

    Returns:
        dict: maps each record's ``"id"`` value to the full record.
    """
    global _data
    if not _data:
        # JSON text is UTF-8; pin the encoding rather than inheriting the
        # platform's locale default, which can mis-decode non-ASCII names.
        with open("data/parks.json", encoding="utf-8") as f:
            _data = {p["id"]: p for p in json.load(f)}
    return _data
@app.route("/") @app.route("/")
@ -26,94 +25,27 @@ def about():
return render_template("about.html") return render_template("about.html")
@app.route("/awards") @app.route("/parks")
def awards():
    """Render the awards page from a fixed, in-code table of awards.

    Every award reaches the template as a dict with the string keys
    ``name``, ``year``, ``for`` and ``to`` (all values are strings).
    """
    columns = ("name", "year", "for", "to")
    rows = (
        ("Nobel Prize in Physics", "1934", "Discovery of the 8th Dimension", "John Whorfin"),
        ("Cousteau Society Award", "1989", "Uses of Cephalopod Intelligence", "John Fish"),
        ("Best Supporting Actor", "1985", "John Lithgow Biopic", "John Whorfin"),
        ("Nobel Prize in Physics", "2986", "Temporal Paradox Resolution", "John O'Connor"),
        ("Paralegal of the Year", "1999", "Paralegal Activity", "John Two Horns"),
        ("ACM Award", "1986", "Innovations in User Interface", "John Ya Ya"),
        ("2nd Place, Most Jars Category", "1987", "Jars", "John Many Jars"),
        ("Album of the Year", "1997", "Space Coyote", "John Coyote"),
        (
            "Most Creative Loophole",
            "1985",
            "Innovation in Interdimensional Tax Shelters",
            "John Lee",
        ),
    )
    # Pair each row with the column names to get the same list of dicts,
    # with the same key order, as a hand-written literal would.
    award_data = [dict(zip(columns, row)) for row in rows]
    return render_template("awards.html", awards=award_data)
@app.route("/staff")
def staff(): def staff():
page = int(request.args.get("page", 1)) page = int(request.args.get("page", 1))
per_page = 10 per_page = 10
total_items = len(employees()) total_items = len(parks())
max_page = math.ceil(total_items / per_page) max_page = math.ceil(total_items / per_page)
print(max_page)
if page < 1 or page > max_page: if page < 1 or page > max_page:
abort(404) abort(404)
page_employees = list(employees().values())[(page - 1) * per_page : page * per_page] page_parks = list(parks().values())[(page - 1) * per_page : page * per_page]
return render_template( return render_template(
"staff.html", "parks.html",
employees=page_employees, parks=page_parks,
page=page, page=page,
prev_page=page - 1, prev_page=page - 1,
next_page=page + 1 if page < max_page else None, next_page=page + 1 if page < max_page else None,
) )
@app.route("/parks/<id_>")
def park_detail(id_):
    """Render the detail page for a single park, or respond 404.

    Args:
        id_: the park id taken from the URL path (a string).

    A 404 is raised via ``abort`` when ``parks()`` has no truthy entry
    under ``id_``.
    """
    if park := parks().get(id_):
        # Pass the record bound above. The previous ``employee=employee``
        # referred to a name that no longer exists in this function and
        # raised NameError for every valid id.
        # NOTE(review): the template name and the ``employee`` keyword are
        # kept from the pre-conversion view -- confirm whether
        # staff_detail.html was renamed or now expects a ``park`` variable.
        return render_template("staff_detail.html", employee=park)
    abort(404)

View File

@ -1,46 +0,0 @@
id,first,last,position,status,children,hired
52,John,Barnett,Scheduling,Married,1,3/6/1963
2,John,Bigbooté,Executive Vice President,Single,0,10/31/1938
13,John,Camp,Human Resources,Single,0,6/12/1985
14,John,Careful Walker,Accounting,Single,0,4/30/1990
15,John,Chief Crier,"VP, Public Relations",Married,2,4/2/1980
16,John,Cooper,Storage Acquisition,Single,0,8/15/1984
17,John,Coyote,Medical Research,Married,6,1/1/1970
18,John,Edwards,Public Relations,Divorced,2,8/15/1984
19,John,Fat Eating,Craft Services,Single,0,8/15/1984
20,John,Fish,Marine R&D,Divorced,12,10/31/1938
21,John,Fledgling,Human Resources,Married,2,7/4/1976
22,John,Gomez,"VP, Sales",Married,1,8/15/1984
23,John,Grim,Actuary,Married,1.8,9/9/1970
24,John,Guardian,Optical Systems Engineer,Single,0,6/9/1980
25,John,Icicle Boy,Refrigeration R&D,Single,0,2/6/1939
404,John,Jones,Government Relations,Married,3,8/15/1984
27,John,Joseph,Government Relations,Married,2,10/31/1938
28,John,Kim Chi,Craft Services,Married,2,7/24/1990
29,John,Lee,Accounting,Single,0,12/14/1988
30,John,Littlejohn,Staff Parliamentarian,Divorced,0,10/31/1938
31,John,Many Jars,Storage Acquisition Lead,Married,3,1/8/1960
666,John,Milton,Chief Counsel,Single,2,6/6/1966
32,John,Mud Head,Apian Research Lead,Married,7,1/2/1956
33,John,Nephew,Temporal Paradox Resolution,Widowed,0,1/20/1474
34,John,Nolan,Custodian,Married,2,10/31/1938
35,John,O'Connor,Temporal Paradox Resolution,Widowed,1,5/15/2022
37,John,Omar,Imports & Exports,Single,0,9/9/1963
38,John,Parrot,Computer Design Specialist,Married,1,10/1/1970
39,John,Rajeesh,Human Resources,Divorced,1,6/12/1982
11,John,Ready to Fly,Aerial R&D,Married,1,7/4/1976
40,John,Repeat Dance,Sales,Married,3,6/8/1949
50,John,Roberts,Counsel,Married,0,4/1/1962
51,John,Scott,Administration,Single,1,10/31/1938
59,John,Shaw,Administration,Single,0,8/15/1984
3,John,Small Berries,Orbital Ergonomitrics Team Leader,Married,3,10/17/1948
4,John,Starbird,Orbital Ergonomitrics,Married,2,10/18/1948
5,John,Take Cover,Security Administrator,Divorced,0,2/15/2019
6,John,Thorny Stick,Sales,Married,0,8/15/1984
7,John,Turk,System Administrator,Engaged,0,11/3/1986
8,John,Two Horns,Paralegal,Married,0,8/15/1984
9,John,Web,IT Support,Married,1,6/8/1949
1,John,Whorfin,CEO / Lord,Single,0,10/31/1938
99,John,Wood,Sales,Married,1,8/12/1948
100,John,Wright,Orbital Mechanics Supervisor,Married,2,10/4/1985
101,John,Ya Ya,Computer Design Specialist,Married,0,10/31/1938
1 id first last position status children hired
2 52 John Barnett Scheduling Married 1 3/6/1963
3 2 John Bigbooté Executive Vice President Single 0 10/31/1938
4 13 John Camp Human Resources Single 0 6/12/1985
5 14 John Careful Walker Accounting Single 0 4/30/1990
6 15 John Chief Crier VP, Public Relations Married 2 4/2/1980
7 16 John Cooper Storage Acquisition Single 0 8/15/1984
8 17 John Coyote Medical Research Married 6 1/1/1970
9 18 John Edwards Public Relations Divorced 2 8/15/1984
10 19 John Fat Eating Craft Services Single 0 8/15/1984
11 20 John Fish Marine R&D Divorced 12 10/31/1938
12 21 John Fledgling Human Resources Married 2 7/4/1976
13 22 John Gomez VP, Sales Married 1 8/15/1984
14 23 John Grim Actuary Married 1.8 9/9/1970
15 24 John Guardian Optical Systems Engineer Single 0 6/9/1980
16 25 John Icicle Boy Refrigeration R&D Single 0 2/6/1939
17 404 John Jones Government Relations Married 3 8/15/1984
18 27 John Joseph Government Relations Married 2 10/31/1938
19 28 John Kim Chi Craft Services Married 2 7/24/1990
20 29 John Lee Accounting Single 0 12/14/1988
21 30 John Littlejohn Staff Parliamentarian Divorced 0 10/31/1938
22 31 John Many Jars Storage Acquisition Lead Married 3 1/8/1960
23 666 John Milton Chief Counsel Single 2 6/6/1966
24 32 John Mud Head Apian Research Lead Married 7 1/2/1956
25 33 John Nephew Temporal Paradox Resolution Widowed 0 1/20/1474
26 34 John Nolan Custodian Married 2 10/31/1938
27 35 John O'Connor Temporal Paradox Resolution Widowed 1 5/15/2022
28 37 John Omar Imports & Exports Single 0 9/9/1963
29 38 John Parrot Computer Design Specialist Married 1 10/1/1970
30 39 John Rajeesh Human Resources Divorced 1 6/12/1982
31 11 John Ready to Fly Aerial R&D Married 1 7/4/1976
32 40 John Repeat Dance Sales Married 3 6/8/1949
33 50 John Roberts Counsel Married 0 4/1/1962
34 51 John Scott Administration Single 1 10/31/1938
35 59 John Shaw Administration Single 0 8/15/1984
36 3 John Small Berries Orbital Ergonomitrics Team Leader Married 3 10/17/1948
37 4 John Starbird Orbital Ergonomitrics Married 2 10/18/1948
38 5 John Take Cover Security Administrator Divorced 0 2/15/2019
39 6 John Thorny Stick Sales Married 0 8/15/1984
40 7 John Turk System Administrator Engaged 0 11/3/1986
41 8 John Two Horns Paralegal Married 0 8/15/1984
42 9 John Web IT Support Married 1 6/8/1949
43 1 John Whorfin CEO / Lord Single 0 10/31/1938
44 99 John Wood Sales Married 1 8/12/1948
45 100 John Wright Orbital Mechanics Supervisor Married 2 10/4/1985
46 101 John Ya Ya Computer Design Specialist Married 0 10/31/1938

1
data/parks.json Normal file

File diff suppressed because one or more lines are too long

View File

@ -1,49 +0,0 @@
# imports we'll use in this example
from spatula import HtmlPage, HtmlListPage, CSS, XPath, SelectorError
class EmployeeList(HtmlListPage):
    """List-page scraper: one ``EmployeeDetail`` sub-scrape per staff table row."""

    # Declaring the source here means it can be left off the command line,
    # which is handy when the scraper only ever targets one page.
    source = "https://yoyodyne-propulsion.herokuapp.com/staff"
    # This second assignment rebinds ``source``, so the localhost URL is the
    # value actually in effect.
    source = "http://localhost:5000/staff"

    # Each matched row represents one employee.
    selector = CSS("#employees tbody tr")

    def process_item(self, item):
        """Turn one ``<tr>`` from ``selector`` into an ``EmployeeDetail`` page.

        The row is expected to have exactly four child cells: first name,
        last name, position, and a cell holding the link to the detail page.
        """
        cells = item.getchildren()
        first_cell, last_cell, position_cell, link_cell = cells
        partial_record = {
            "first": first_cell.text,
            "last": last_cell.text,
            "position": position_cell.text,
        }
        detail_href = XPath("./a/@href").match_one(link_cell)
        return EmployeeDetail(partial_record, source=detail_href)

    def get_next_source(self):
        """Return the href of the 'Next' link, or ``None`` when there is none."""
        try:
            next_href = XPath("//a[contains(text(), 'Next')]/@href").match_one(
                self.root
            )
        except SelectorError:
            # No 'Next' link matched.
            return None
        return next_href
class EmployeeDetail(HtmlPage):
    """Detail-page scraper that extends the record passed in from the list page."""

    def process_page(self):
        """Return the detail-page fields merged with the incoming record.

        ``self.input`` is the data handed over by the prior scrape (here a
        dict), expanded into the result after the three scraped fields.
        """
        field_selectors = (
            ("marital_status", "#status"),
            ("children", "#children"),
            ("hired", "#hired"),
        )
        # Match every element before reading any text.
        matched = [
            (field, CSS(css).match_one(self.root)) for field, css in field_selectors
        ]
        scraped = {field: element.text for field, element in matched}
        # Keyword expansion (rather than a dict merge) keeps the rule that a
        # key present in both mappings is an error, not a silent overwrite.
        return dict(**scraped, **self.input)

    def process_error_response(self, exception):
        """Log the failed response as a warning."""
        self.logger.warning(exception)

1033
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,6 @@ authors = ["James Turk <dev@jamesturk.net>"]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.9" python = "^3.9"
Flask = "^2.0.1" Flask = "^2.0.1"
spatula = "^0.6.0"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]

View File

@ -7,17 +7,18 @@
</h2> </h2>
<div class="section"> <div class="section">
<p> <p>
This site exists to demonstrate some basic scraping techniques, as well as provide examples of real world challenges that scraper authors often encounter. It was primarily written to assist with examples and documentation for <a href="https://github.com/jamesturk/spatula">spatula</a> but feel free to use it for whatever you want. Just be nice, the site is hosted on a free <a href="https://fly.io">fly.io</a> VM. This site is generated from data scraped from <a href="https://www.chicagoparkdistrict.com/about-us/history-chicagos-parks">Chicago Park District</a>. It is not meant to be authoritative and is not kept up to date.
<br>
This site was originally created as a demo for <a href="https://github.com/jamesturk/spatula">spatula</a> and updated for CAPP 30122 at the University of Chicago.
</p>
<p>
This site exists to demonstrate some basic scraping techniques, as well as provide examples of real world challenges that scraper authors often encounter. You may use it for this purpose however you see fit. Just be nice, the site is hosted on a free <a href="https://fly.io">fly.io</a> VM.
</p> </p>
<p>While other sites with a similar purpose exist, they often omit some of the common complications that can arise in scraping. The eventual goal of this site is to serve as a test suite of sorts for handling edge cases. <p>While other sites with a similar purpose exist, they often omit some of the common complications that can arise in scraping. The eventual goal of this site is to serve as a test suite of sorts for handling edge cases.
</p> </p>
<p>This site is the work of <a href="https://jamesturk.net">James Turk</a>. James has been getting paid to write web scrapers for over 10 years, mainly on the <a href="https://openstates.org">Open States</a> project. This work is created independently of any employer, past or present.</p>
<p>The full source for this site is available on <a href="https://github.com/jamesturk/yoyodyne-propulsion">GitHub</a>.</p>
<p>If you're wondering about what all the nonsense is about... <a href="https://letterboxd.com/film/the-adventures-of-buckaroo-banzai-across-the-8th-dimension/">here you go</a>.</p>
</div> </div>
</div> </div>
{% endblock %} {% endblock %}

View File

@ -1,26 +0,0 @@
{% extends "base.html" %}
{% block base %}
<div class="card fluid">
<h1 class="section">Yoyodyne Propulsion Systems <small>Awards</small></h1>
<div class="section">
<div class="row">
{% for award in awards %}
<div class="card large">
<div class="section">
<h2>{{ award.name }} <small>{{ award.year }}</small></h2>
<dl>
<dt>For</dt>
<dd>{{ award.for }}</dd>
<dt>Awarded To</dt>
<dd>{{ award.to }}</dd>
</dl>
</div>
</div>
{% endfor %}
</div>
</div>
</div>
{% endblock %}

View File

@ -6,10 +6,9 @@
</head> </head>
<body> <body>
<header> <header>
<a href="#" class="logo">YPS</a> <a href="#" class="logo">Chicago Parks</a>
<a href="/" class="button">Home</a> <a href="/" class="button">Home</a>
<a href="/staff" class="button">Staff</a> <a href="/parks" class="button">Parks</a>
<a href="/awards" class="button">Awards</a>
<a href="/about" class="button">About</a> <a href="/about" class="button">About</a>
</header> </header>
{% block base %} {% block base %}

View File

@ -2,11 +2,10 @@
{% block base %} {% block base %}
<div class="card fluid"> <div class="card fluid">
<h1 class="section">Yoyodyne Propulsion Systems <small>The future begins tomorrow</small></h1> <h1 class="section">Chicago Parks</h1>
<div class="section"> <div class="section">
<p>Welcome to Yoyodyne Propulsion Systems, where the future begins tomorrow.</p>
<p> <p>
Since 1938 we have been producing key technological components for the success of our great nation, from our home office in New Jersey. One of America's best-kept secrets is Chicago's historic park system. Even Chicagoans who routinely enjoy its diverse open spaces- from the magnificent lakeshore parks to intimate neighborhood settings- may be surprised about their parkland legacy. We invite you to learn more about the history of Chicago parks, which are second to none in America and abroad.
</p> </p>
</div> </div>
</div> </div>

View File

@ -2,23 +2,21 @@
{% block base %} {% block base %}
<div class="card fluid"> <div class="card fluid">
<h1 class="section">Yoyodyne Propulsion Systems <small>Staff Roster</small></h1> <h1 class="section">Chicago Parks<small>List of Parks</small></h1>
<div class="section"> <div class="section">
<table id="employees" style="max-height: 100%;"> <table id="employees" style="max-height: 100%;">
<thead> <thead>
<tr> <tr>
<th>First Name</th> <th>Name</th>
<th>Last Name</th> <th>Location</th>
<th>Position Name</th>
<th>&nbsp;</th> <th>&nbsp;</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
{% for e in employees %} {% for e in parks %}
<tr> <tr>
<td>{{ e.first }}</td> <td>{{ e.title }}</td>
<td>{{ e.last }}</td> <td>{{ e.address }}</td>
<td>{{ e.position }}</td>
<td><a href="/staff/{{ e.id }}">Details</a></td> <td><a href="/parks/{{ e.id }}">Details</a></td>
</tr> </tr>
{% endfor %} {% endfor %}