conversion to parks
This commit is contained in:
parent
1385722b10
commit
f95f2e03c1
@ -1,3 +1,4 @@
|
|||||||
|
Sample site for web scraping practice.
|
||||||
|
|
||||||
Visit https://yoyodyne-propulsion.fly.dev/ to see the site.
|
Visit https://yoyodyne-propulsion.fly.dev/ to see the site.
|
||||||
|
|
||||||
This is a sample site for [spatula](https://jamesturk.github.io/spatula/) docs.
|
|
||||||
|
100
app.py
100
app.py
@ -1,19 +1,18 @@
|
|||||||
import csv
|
import json
|
||||||
import math
|
import math
|
||||||
from flask import Flask, render_template, abort, request
|
from flask import Flask, render_template, abort, request
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
_employees = {}
|
_data = {}
|
||||||
|
|
||||||
|
|
||||||
def employees():
|
def parks():
|
||||||
# thanks to http://www.figmentfly.com/bb/badguys3.html for names
|
global _data
|
||||||
global _employees
|
if not _data:
|
||||||
if not _employees:
|
with open("data/parks.json") as f:
|
||||||
with open("data/employees.csv") as f:
|
_data = {p["id"]: p for p in json.load(f)}
|
||||||
_employees = {e["id"]: e for e in csv.DictReader(f)}
|
return _data
|
||||||
return _employees
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/")
|
@app.route("/")
|
||||||
@ -26,94 +25,27 @@ def about():
|
|||||||
return render_template("about.html")
|
return render_template("about.html")
|
||||||
|
|
||||||
|
|
||||||
@app.route("/awards")
|
@app.route("/parks")
|
||||||
def awards():
|
|
||||||
award_data = [
|
|
||||||
{
|
|
||||||
"name": "Nobel Prize in Physics",
|
|
||||||
"year": "1934",
|
|
||||||
"for": "Discovery of the 8th Dimension",
|
|
||||||
"to": "John Whorfin",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Cousteau Society Award",
|
|
||||||
"year": "1989",
|
|
||||||
"for": "Uses of Cephalopod Intelligence",
|
|
||||||
"to": "John Fish",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Best Supporting Actor",
|
|
||||||
"year": "1985",
|
|
||||||
"for": "John Lithgow Biopic",
|
|
||||||
"to": "John Whorfin",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Nobel Prize in Physics",
|
|
||||||
"year": "2986",
|
|
||||||
"for": "Temporal Paradox Resolution",
|
|
||||||
"to": "John O'Connor",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Paralegal of the Year",
|
|
||||||
"year": "1999",
|
|
||||||
"for": "Paralegal Activity",
|
|
||||||
"to": "John Two Horns",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "ACM Award",
|
|
||||||
"year": "1986",
|
|
||||||
"for": "Innovations in User Interface",
|
|
||||||
"to": "John Ya Ya",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "2nd Place, Most Jars Category",
|
|
||||||
"year": "1987",
|
|
||||||
"for": "Jars",
|
|
||||||
"to": "John Many Jars",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Album of the Year",
|
|
||||||
"year": "1997",
|
|
||||||
"for": "Space Coyote",
|
|
||||||
"to": "John Coyote",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Most Creative Loophole",
|
|
||||||
"year": "1985",
|
|
||||||
"for": "Innovation in Interdimensional Tax Shelters",
|
|
||||||
"to": "John Lee",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
return render_template("awards.html", awards=award_data)
|
|
||||||
|
|
||||||
|
|
||||||
@app.route("/staff")
|
|
||||||
def staff():
|
def staff():
|
||||||
page = int(request.args.get("page", 1))
|
page = int(request.args.get("page", 1))
|
||||||
per_page = 10
|
per_page = 10
|
||||||
total_items = len(employees())
|
total_items = len(parks())
|
||||||
max_page = math.ceil(total_items / per_page)
|
max_page = math.ceil(total_items / per_page)
|
||||||
print(max_page)
|
|
||||||
if page < 1 or page > max_page:
|
if page < 1 or page > max_page:
|
||||||
abort(404)
|
abort(404)
|
||||||
page_employees = list(employees().values())[(page - 1) * per_page : page * per_page]
|
page_parks = list(parks().values())[(page - 1) * per_page : page * per_page]
|
||||||
return render_template(
|
return render_template(
|
||||||
"staff.html",
|
"parks.html",
|
||||||
employees=page_employees,
|
parks=page_parks,
|
||||||
page=page,
|
page=page,
|
||||||
prev_page=page - 1,
|
prev_page=page - 1,
|
||||||
next_page=page + 1 if page < max_page else None,
|
next_page=page + 1 if page < max_page else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/staff/<id_>")
|
@app.route("/parks/<id_>")
|
||||||
def staff_detail(id_):
|
def park_detail(id_):
|
||||||
if id_ == "404":
|
if park := parks().get(id_):
|
||||||
abort(
|
|
||||||
404,
|
|
||||||
"This page intentionally left blank. (No really! This is an intentional error for demonstration purposes.)",
|
|
||||||
)
|
|
||||||
if employee := employees().get(id_):
|
|
||||||
return render_template("staff_detail.html", employee=employee)
|
return render_template("staff_detail.html", employee=employee)
|
||||||
else:
|
else:
|
||||||
abort(404)
|
abort(404)
|
||||||
|
@ -1,46 +0,0 @@
|
|||||||
id,first,last,position,status,children,hired
|
|
||||||
52,John,Barnett,Scheduling,Married,1,3/6/1963
|
|
||||||
2,John,Bigbooté,Executive Vice President,Single,0,10/31/1938
|
|
||||||
13,John,Camp,Human Resources,Single,0,6/12/1985
|
|
||||||
14,John,Careful Walker,Accounting,Single,0,4/30/1990
|
|
||||||
15,John,Chief Crier,"VP, Public Relations",Married,2,4/2/1980
|
|
||||||
16,John,Cooper,Storage Acquisition,Single,0,8/15/1984
|
|
||||||
17,John,Coyote,Medical Research,Married,6,1/1/1970
|
|
||||||
18,John,Edwards,Public Relations,Divorced,2,8/15/1984
|
|
||||||
19,John,Fat Eating,Craft Services,Single,0,8/15/1984
|
|
||||||
20,John,Fish,Marine R&D,Divorced,12,10/31/1938
|
|
||||||
21,John,Fledgling,Human Resources,Married,2,7/4/1976
|
|
||||||
22,John,Gomez,"VP, Sales",Married,1,8/15/1984
|
|
||||||
23,John,Grim,Actuary,Married,1.8,9/9/1970
|
|
||||||
24,John,Guardian,Optical Systems Engineer,Single,0,6/9/1980
|
|
||||||
25,John,Icicle Boy,Refrigeration R&D,Single,0,2/6/1939
|
|
||||||
404,John,Jones,Government Relations,Married,3,8/15/1984
|
|
||||||
27,John,Joseph,Government Relations,Married,2,10/31/1938
|
|
||||||
28,John,Kim Chi,Craft Services,Married,2,7/24/1990
|
|
||||||
29,John,Lee,Accounting,Single,0,12/14/1988
|
|
||||||
30,John,Littlejohn,Staff Parliamentarian,Divorced,0,10/31/1938
|
|
||||||
31,John,Many Jars,Storage Acquisition Lead,Married,3,1/8/1960
|
|
||||||
666,John,Milton,Chief Counsel,Single,2,6/6/1966
|
|
||||||
32,John,Mud Head,Apian Research Lead,Married,7,1/2/1956
|
|
||||||
33,John,Nephew,Temporal Paradox Resolution,Widowed,0,1/20/1474
|
|
||||||
34,John,Nolan,Custodian,Married,2,10/31/1938
|
|
||||||
35,John,O'Connor,Temporal Paradox Resolution,Widowed,1,5/15/2022
|
|
||||||
37,John,Omar,Imports & Exports,Single,0,9/9/1963
|
|
||||||
38,John,Parrot,Computer Design Specialist,Married,1,10/1/1970
|
|
||||||
39,John,Rajeesh,Human Resources,Divorced,1,6/12/1982
|
|
||||||
11,John,Ready to Fly,Aerial R&D,Married,1,7/4/1976
|
|
||||||
40,John,Repeat Dance,Sales,Married,3,6/8/1949
|
|
||||||
50,John,Roberts,Counsel,Married,0,4/1/1962
|
|
||||||
51,John,Scott,Administration,Single,1,10/31/1938
|
|
||||||
59,John,Shaw,Administration,Single,0,8/15/1984
|
|
||||||
3,John,Small Berries,Orbital Ergonomitrics Team Leader,Married,3,10/17/1948
|
|
||||||
4,John,Starbird,Orbital Ergonomitrics,Married,2,10/18/1948
|
|
||||||
5,John,Take Cover,Security Administrator,Divorced,0,2/15/2019
|
|
||||||
6,John,Thorny Stick,Sales,Married,0,8/15/1984
|
|
||||||
7,John,Turk,System Administrator,Engaged,0,11/3/1986
|
|
||||||
8,John,Two Horns,Paralegal,Married,0,8/15/1984
|
|
||||||
9,John,Web,IT Support,Married,1,6/8/1949
|
|
||||||
1,John,Whorfin,CEO / Lord,Single,0,10/31/1938
|
|
||||||
99,John,Wood,Sales,Married,1,8/12/1948
|
|
||||||
100,John,Wright,Orbital Mechanics Supervisor,Married,2,10/4/1985
|
|
||||||
101,John,Ya Ya,Computer Design Specialist,Married,0,10/31/1938
|
|
|
1
data/parks.json
Normal file
1
data/parks.json
Normal file
File diff suppressed because one or more lines are too long
@ -1,49 +0,0 @@
|
|||||||
# imports we'll use in this example
|
|
||||||
from spatula import HtmlPage, HtmlListPage, CSS, XPath, SelectorError
|
|
||||||
|
|
||||||
|
|
||||||
class EmployeeList(HtmlListPage):
|
|
||||||
# by providing this here, it can be omitted on the command line
|
|
||||||
# useful in cases where the scraper is only meant for one page
|
|
||||||
source = "https://yoyodyne-propulsion.herokuapp.com/staff"
|
|
||||||
source = "http://localhost:5000/staff"
|
|
||||||
|
|
||||||
# each row represents an employee
|
|
||||||
selector = CSS("#employees tbody tr")
|
|
||||||
|
|
||||||
def process_item(self, item):
|
|
||||||
# this function is called for each <tr> we get from the selector
|
|
||||||
# we know there are 4 <tds>
|
|
||||||
first, last, position, details = item.getchildren()
|
|
||||||
return EmployeeDetail(
|
|
||||||
dict(
|
|
||||||
first=first.text,
|
|
||||||
last=last.text,
|
|
||||||
position=position.text,
|
|
||||||
),
|
|
||||||
source=XPath("./a/@href").match_one(details),
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_next_source(self):
|
|
||||||
try:
|
|
||||||
return XPath("//a[contains(text(), 'Next')]/@href").match_one(self.root)
|
|
||||||
except SelectorError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class EmployeeDetail(HtmlPage):
|
|
||||||
def process_page(self):
|
|
||||||
marital_status = CSS("#status").match_one(self.root)
|
|
||||||
children = CSS("#children").match_one(self.root)
|
|
||||||
hired = CSS("#hired").match_one(self.root)
|
|
||||||
return dict(
|
|
||||||
marital_status=marital_status.text,
|
|
||||||
children=children.text,
|
|
||||||
hired=hired.text,
|
|
||||||
# self.input is the data passed in from the prior scrape,
|
|
||||||
# in this case a dict we can expand here
|
|
||||||
**self.input,
|
|
||||||
)
|
|
||||||
|
|
||||||
def process_error_response(self, exception):
|
|
||||||
self.logger.warning(exception)
|
|
1033
poetry.lock
generated
1033
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -7,7 +7,6 @@ authors = ["James Turk <dev@jamesturk.net>"]
|
|||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
Flask = "^2.0.1"
|
Flask = "^2.0.1"
|
||||||
spatula = "^0.6.0"
|
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
|
|
||||||
|
@ -7,17 +7,18 @@
|
|||||||
</h2>
|
</h2>
|
||||||
<div class="section">
|
<div class="section">
|
||||||
<p>
|
<p>
|
||||||
This site exists to demonstrate some basic scraping techniques, as well as provide examples of real world challenges that scraper authors often encounter. It was primarily written to assist with examples and documentation for <a href="https://github.com/jamesturk/spatula">spatula</a> but feel free to use it for whatever you want. Just be nice, the site is hosted on a free <a href="https://fly.io">fly.io</a> VM.
|
This site is generated from data scraped from <a href="https://www.chicagoparkdistrict.com/about-us/history-chicagos-parks">Chicago Park District</a>. It is not meant to be authoritative and is not kept up to date.
|
||||||
|
<br>
|
||||||
|
This site was originally created as a demo for <a href="https://github.com/jamesturk/spatula">spatula</a> and updated for CAPP 30122 at the University of Chicago.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
This site exists to demonstrate some basic scraping techniques, as well as provide examples of real world challenges that scraper authors often encounter. You may use it for this purpose however you see fit. Just be nice, the site is hosted on a free <a href="https://fly.io">fly.io</a> VM.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p>While other sites with a similar purpose exist, they often omit some of the common complications that can arise in scraping. The eventual goal of this site is to serve as a test suite of sorts for handling edge cases.
|
<p>While other sites with a similar purpose exist, they often omit some of the common complications that can arise in scraping. The eventual goal of this site is to serve as a test suite of sorts for handling edge cases.
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p>This site is the work of <a href="https://jamesturk.net">James Turk</a>. James has been getting paid to write web scrapers for over 10 years, mainly on the <a href="https://openstates.org">Open States</a> project. This work is created independently of any employer, past or present.</p>
|
|
||||||
|
|
||||||
<p>The full source for this site is available on <a href="https://github.com/jamesturk/yoyodyne-propulsion">GitHub</a>.</p>
|
|
||||||
|
|
||||||
<p>If you're wondering about what all the nonsense is about... <a href="https://letterboxd.com/film/the-adventures-of-buckaroo-banzai-across-the-8th-dimension/">here you go</a>.</p>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
{% endblock %}
|
{% endblock %}
|
||||||
|
@ -1,26 +0,0 @@
|
|||||||
{% extends "base.html" %}
|
|
||||||
|
|
||||||
{% block base %}
|
|
||||||
<div class="card fluid">
|
|
||||||
<h1 class="section">Yoyodyne Propulsion Systems <small>Awards</small></h1>
|
|
||||||
<div class="section">
|
|
||||||
<div class="row">
|
|
||||||
|
|
||||||
{% for award in awards %}
|
|
||||||
<div class="card large">
|
|
||||||
<div class="section">
|
|
||||||
<h2>{{ award.name }} <small>{{ award.year }}</small></h2>
|
|
||||||
<dl>
|
|
||||||
<dt>For</dt>
|
|
||||||
<dd>{{ award.for }}</dd>
|
|
||||||
<dt>Awarded To</dt>
|
|
||||||
<dd>{{ award.to }}</dd>
|
|
||||||
</dl>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
{% endfor %}
|
|
||||||
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
{% endblock %}
|
|
@ -6,10 +6,9 @@
|
|||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<header>
|
<header>
|
||||||
<a href="#" class="logo">YPS</a>
|
<a href="#" class="logo">Chicago Parks</a>
|
||||||
<a href="/" class="button">Home</a>
|
<a href="/" class="button">Home</a>
|
||||||
<a href="/staff" class="button">Staff</a>
|
<a href="/parks" class="button">Parks</a>
|
||||||
<a href="/awards" class="button">Awards</a>
|
|
||||||
<a href="/about" class="button">About</a>
|
<a href="/about" class="button">About</a>
|
||||||
</header>
|
</header>
|
||||||
{% block base %}
|
{% block base %}
|
||||||
|
@ -2,11 +2,10 @@
|
|||||||
|
|
||||||
{% block base %}
|
{% block base %}
|
||||||
<div class="card fluid">
|
<div class="card fluid">
|
||||||
<h1 class="section">Yoyodyne Propulsion Systems <small>The future begins tomorrow</small></h1>
|
<h1 class="section">Chicago Parks</h1>
|
||||||
<div class="section">
|
<div class="section">
|
||||||
<p>Welcome to Yoyodyne Propulsion Systems, where the future begins tomorrow.</p>
|
|
||||||
<p>
|
<p>
|
||||||
Since 1938 we have been producing key technological components for the success of our great nation, from our home office in New Jersey.
|
One of America's best-kept secrets is Chicago's historic park system. Even Chicagoans who routinely enjoy its diverse open spaces — from the magnificent lakeshore parks to intimate neighborhood settings — may be surprised about their parkland legacy. We invite you to learn more about the history of Chicago parks, which are second to none in America and abroad.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
@ -2,23 +2,21 @@
|
|||||||
|
|
||||||
{% block base %}
|
{% block base %}
|
||||||
<div class="card fluid">
|
<div class="card fluid">
|
||||||
<h1 class="section">Yoyodyne Propulsion Systems <small>Staff Roster</small></h1>
|
<h1 class="section">Chicago Parks <small>List of Parks</small></h1>
|
||||||
<div class="section">
|
<div class="section">
|
||||||
<table id="employees" style="max-height: 100%;">
|
<table id="employees" style="max-height: 100%;">
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th>First Name</th>
|
<th>Name</th>
|
||||||
<th>Last Name</th>
|
<th>Location</th>
|
||||||
<th>Position Name</th>
|
|
||||||
<th> </th>
|
<th> </th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for e in employees %}
|
{% for e in parks %}
|
||||||
<tr>
|
<tr>
|
||||||
<td>{{ e.first }}</td>
|
<td>{{ e.title }}</td>
|
||||||
<td>{{ e.last }}</td>
|
<td>{{ e.address }}</td>
|
||||||
<td>{{ e.position }}</td>
|
|
||||||
<td><a href="/staff/{{ e.id }}">Details</a></td>
|
<td><a href="/parks/{{ e.id }}">Details</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
Loading…
Reference in New Issue
Block a user