saucebrush/saucebrush/sources.py

265 lines
8.5 KiB
Python
Raw Normal View History

2008-10-27 19:13:50 +00:00
"""
Saucebrush data sources, convert data in some format into python dicts.
All sources must implement the iterable interface and return python
dictionaries.
"""
import string
2008-11-04 21:39:40 +00:00
from saucebrush import utils
2008-10-27 19:13:50 +00:00
class CSVSource(object):
    """ Saucebrush source that reads records from a CSV file.

        Wraps csv.DictReader around an already-open csvfile.  The caller may
        supply the column names and a number of leading rows to discard;
        any extra keyword arguments are forwarded to csv.DictReader.

        CSVSource(open('test.csv')) reads the file and takes the column
        names from its first row.

        CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) names
        the three columns name, phone, and address, and discards the first
        row (presumed to hold the column names).
    """

    def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):
        import csv

        reader = csv.DictReader(csvfile, fieldnames, **kwargs)
        # Consume the rows the caller asked to skip before exposing the
        # reader for iteration.
        for _skipped in range(skiprows):
            next(reader)
        self._dictreader = reader

    def __iter__(self):
        # The DictReader is itself the iterator over the remaining rows.
        return self._dictreader
class FixedWidthFileSource(object):
    """ Saucebrush source that reads records from a fixed width file.

        Expects an open fixed width file and a sequence of
        (field name, field length) pairs.  The optional fillchars argument
        gives the filler characters stripped from the end of each field
        (whitespace by default).

        FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
        reads a file in which the first 30 characters of every line are a
        name and characters 31-42 are a phone number.
    """

    def __init__(self, fwfile, fields, fillchars=string.whitespace):
        self._fwfile = fwfile
        self._fillchars = fillchars

        # Map each field name to its (start, end) slice bounds; fields are
        # laid out back to back in the order given.
        self._fields_dict = {}
        start = 0
        for field, size in fields:
            end = start + size
            self._fields_dict[field] = (start, end)
            start = end

    def __iter__(self):
        return self

    def __next__(self):
        # StopIteration from the underlying file ends this iterator too.
        line = next(self._fwfile)
        return {
            name: line[bounds[0]:bounds[1]].rstrip(self._fillchars)
            for name, bounds in self._fields_dict.items()
        }

    def next(self):
        """ Python 2 spelling of the iterator protocol; defers to __next__().
        """
        return self.__next__()
2008-10-27 19:13:50 +00:00
class HtmlTableSource(object):
    """ Saucebrush source for reading data from an HTML table.

        HtmlTableSource expects an open html file, the id of the table or a
        number indicating which table on the page to use, an optional
        fieldnames tuple, and an optional number of rows to skip.

        HtmlTableSource(open('test.html'), 0) opens the first HTML table and
        uses the first row as the names of the columns.

        HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1)
        opens the HTML table with an id of 'people' and names the two
        columns name and phone, skipping the first row where alternate
        names are stored.

        Raises TypeError if id_or_num is neither an int nor a str, and
        ValueError if no table with the requested id exists.
    """

    def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
        # Imported here rather than at module level, so BeautifulSoup is
        # only required when this source is actually used.
        from BeautifulSoup import BeautifulSoup

        # extract the table
        soup = BeautifulSoup(htmlfile.read())
        if isinstance(id_or_num, int):
            table = soup.findAll('table')[id_or_num]
        elif isinstance(id_or_num, str):
            table = soup.find('table', id=id_or_num)
            if table is None:
                raise ValueError('no table with id %r found' % (id_or_num,))
        else:
            # Previously fell through and failed later with an unbound
            # local variable.
            raise TypeError('id_or_num must be an int or a str, not %s'
                            % type(id_or_num).__name__)

        # skip the necessary number of rows
        rows = table.findAll('tr')[skiprows:]

        # determine the fieldnames
        if not fieldnames:
            self._fieldnames = [cell.string
                                for cell in rows[0].findAll(('td', 'th'))]
            # The row that supplied the names is a header, not data, so it
            # must not be yielded as a record.
            rows = rows[1:]
        else:
            self._fieldnames = fieldnames

        self._rows = rows

    def process_tr(self):
        """ Yield one dict per remaining table row, pairing the field names
            with the text extracted from each td cell.
        """
        for row in self._rows:
            strings = [utils.string_dig(td) for td in row.findAll('td')]
            yield dict(zip(self._fieldnames, strings))

    def __iter__(self):
        return self.process_tr()
class DjangoModelSource(object):
    """ Saucebrush source that reads records from a django model.

        Takes a django settings file, an app label, and a model name.
        Each record holds the columns declared on the model's fields.

        DjangoModelSource('settings.py', 'phonebook', 'friend') would read
        every friend from the friend model of the phonebook app described
        in settings.py.
    """

    def __init__(self, dj_settings, app_label, model_name):
        model = utils.get_django_model(dj_settings, app_label, model_name)
        fetch_values = model.objects.values

        # Ask only for the fields declared on the model, so that a custom
        # manager cannot add extra columns to the records.
        declared_names = [field.name for field in model._meta.fields]
        self._data = fetch_values(*declared_names)

    def __iter__(self):
        return iter(self._data)
2009-07-22 18:24:21 +00:00
class MongoDBSource(object):
    """ Source for reading from a MongoDB database.

        The record dict is populated with records matching the spec
        from the specified database and collection.

        A ready-made connection may be passed as conn; it only needs to
        support conn[database][collection].  When conn is None a new
        connection to host:port is opened.
    """

    def __init__(self, database, collection, spec=None, host='localhost', port=27017, conn=None):
        # Compare against None explicitly: a caller-supplied connection
        # object must be used even if it happens to evaluate as false.
        if conn is None:
            try:
                # MongoClient replaces the legacy Connection class, which
                # newer PyMongo releases no longer provide.
                from pymongo import MongoClient as Connection
            except ImportError:
                from pymongo.connection import Connection
            conn = Connection(host, port)
        self.collection = conn[database][collection]
        self.spec = spec

    def __iter__(self):
        return self._find_spec()

    def _find_spec(self):
        """ Run the query and yield each matching document as a plain dict.
        """
        for doc in self.collection.find(self.spec):
            yield dict(doc)
2009-07-28 20:44:21 +00:00
# dict_factory for sqlite source
def dict_factory(cursor, row):
    """ Build a dict for one result row, keyed by the column names taken
        from the first item of each entry in cursor.description.
    """
    return {
        column[0]: row[position]
        for position, column in enumerate(cursor.description)
    }
2009-07-22 18:24:21 +00:00
class SqliteSource(object):
    """ Source that reads from a sqlite database.

        The record dict is populated with the results from the
        query argument. If given, args will be passed to the query
        when executed.

        conn_params, if given, is a dict of attribute names and values
        that are set on the sqlite3 connection after it is opened.

        Call done() to close the connection once reading is finished.
    """

    def __init__(self, dbpath, query, args=None, conn_params=None):
        import sqlite3

        self._dbpath = dbpath
        self._query = query
        self._args = args or []
        # Default to a dict: the parameters are consumed through .items().
        self._conn_params = conn_params or {}

        # setup connection
        self._conn = sqlite3.connect(self._dbpath)
        self._conn.row_factory = dict_factory
        # Applied after row_factory, so conn_params may override it.
        for param, value in self._conn_params.items():
            setattr(self._conn, param, value)

    def _process_query(self):
        """ Execute the query on a fresh cursor and yield each row.
        """
        cursor = self._conn.cursor()
        try:
            for row in cursor.execute(self._query, self._args):
                yield row
        finally:
            # Runs on exhaustion, on error, and when the consumer abandons
            # the generator early, so the cursor is always released.
            cursor.close()

    def __iter__(self):
        return self._process_query()

    def done(self):
        """ Close the underlying database connection.
        """
        self._conn.close()
2010-06-17 21:06:28 +00:00
class FileSource(object):
    """ Base class for sources which read from one or more files.

        Takes as input a file-like, a file path, a list of file-likes,
        or a list of file paths.  File-likes are recognised by having a
        read attribute; paths are strings and are opened (and closed again)
        by this class.  Inputs or list elements of any other kind are
        skipped without error.

        Subclasses implement _process_file(file), which receives one open
        file at a time and returns an iterable of records.
    """

    def __init__(self, input):
        self._input = input

    @staticmethod
    def _is_filelike(candidate):
        """ Return True if candidate can be handed to _process_file as is.
        """
        return hasattr(candidate, 'read')

    @staticmethod
    def _is_path(candidate):
        """ Return True if candidate is a string naming a file to open.
        """
        try:
            string_types = basestring   # Python 2: covers str and unicode
        except NameError:
            string_types = str          # Python 3
        return isinstance(candidate, string_types)

    def _read_one(self, source):
        """ Yield the records of a single file-like or file path.
        """
        if self._is_filelike(source):
            for record in self._process_file(source):
                yield record
        elif self._is_path(source):
            with open(source) as f:
                for record in self._process_file(f):
                    yield record

    def __iter__(self):
        # A single file-like or path is treated as a one-element collection.
        # This check must come first, because file objects and (on
        # Python 3) strings are themselves iterable.
        if self._is_filelike(self._input) or self._is_path(self._input):
            sources = (self._input,)
        elif hasattr(self._input, '__iter__'):
            sources = self._input
        else:
            sources = ()

        for source in sources:
            for record in self._read_one(source):
                yield record

    def _process_file(self, file):
        raise NotImplementedError('Descendants of FileSource should implement'
                                  ' a custom _process_file method.')
class JSONSource(FileSource):
    """ Source for reading from JSON files.

        A file whose top-level JSON value is a list produces one record
        per list member; any other top-level value is produced as a single
        record.
    """

    def _process_file(self, file):
        import json

        parsed = json.load(file)

        # Treat a non-list top-level value as a one-element sequence so
        # both cases share the same loop.
        records = parsed if isinstance(parsed, list) else [parsed]
        for record in records:
            yield record