saucebrush/saucebrush/sources.py

"""
    Saucebrush data sources, convert data in some format into python dicts.

    All sources must implement the iterable interface and return python
    dictionaries.
"""

import string
from saucebrush import utils

class CSVSource(object):
    """ Saucebrush source for reading from CSV files.

        Takes an open csvfile, an optional set of fieldnames and optional number
        of rows to skip.  Any extra keyword arguments are handed unchanged to
        csv.DictReader.

        CSVSource(open('test.csv')) will read a csvfile, using the first row as
        the field names.

        CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read
        in a CSV file and treat the three columns as name, phone, and address,
        ignoring the first row (presumed to be column names).

        Rows are skipped by advancing the DictReader, so when fieldnames is
        omitted the header row is consumed first and skiprows counts the rows
        that follow it.  StopIteration propagates from the constructor if
        there are fewer rows than skiprows.
    """

    def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):
        import csv
        self._dictreader = csv.DictReader(csvfile, fieldnames, **kwargs)
        # builtin range()/next() behave the same on Python 2.6+ and Python 3,
        # unlike xrange() and the .next() method
        for _ in range(skiprows):
            next(self._dictreader)

    def __iter__(self):
        # the DictReader is handed out directly, so the source can only be
        # consumed once
        return self._dictreader


class FixedWidthFileSource(object):
    """ Saucebrush source for reading from fixed width field files.

        FixedWidthFileSource expects an open fixed width file and a tuple
        of fields with their lengths.  There is also an optional fillchars
        argument that is the filler characters to strip from the end of each
        field. (defaults to whitespace)

        FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))
        will read in a fixed width file where the first 30 characters of each
        line are part of a name and the characters 31-42 are a phone number.

        The source is its own iterator: each step reads one line from the
        file and returns a dict mapping every field name to its stripped
        slice of that line.
    """

    def __init__(self, fwfile, fields, fillchars=string.whitespace):
        self._fwfile = fwfile
        self._fields_dict = {}
        self._fillchars = fillchars
        # fields are laid out back to back, so each one starts where the
        # previous one ended
        offset = 0
        for field, size in fields:
            self._fields_dict[field] = (offset, offset + size)
            offset += size

    def __iter__(self):
        return self

    def __next__(self):
        # builtin next() works for file objects on Python 2.6+ and Python 3;
        # StopIteration from the file ends iteration of this source too
        line = next(self._fwfile)
        record = {}
        for name, (start, end) in self._fields_dict.items():
            record[name] = line[start:end].rstrip(self._fillchars)
        return record

    # Python 2 spelling of the iterator protocol
    next = __next__


class HtmlTableSource(object):
    """ Saucebrush source for reading data from an HTML table.

        HtmlTableSource expects an open html file, the id of the table or a
        number indicating which table on the page to use, an optional fieldnames
        tuple, and an optional number of rows to skip.

        HtmlTableSource(open('test.html'), 0) opens the first HTML table and
        uses the first row as the names of the columns.

        HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens
        the HTML table with an id of 'people' and names the two columns
        name and phone, skipping the first row where alternate names are
        stored.

        When the column names are taken from the table itself, the row they
        came from is not emitted as a record.

        Raises IndexError if a numeric table index is out of range (or no row
        is left to read the column names from) and ValueError if no table
        carries the requested id.
    """

    def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):

        # extract the table
        from BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(htmlfile.read())
        if isinstance(id_or_num, int):
            table = soup.findAll('table')[id_or_num]
        else:
            # anything that is not an index is treated as an element id, so
            # unicode ids work too and table can never be left unassigned
            table = soup.find('table', id=id_or_num)
            if table is None:
                raise ValueError('no table with id %r found' % (id_or_num,))

        # skip the necessary number of rows
        rows = table.findAll('tr')[skiprows:]

        # determine the fieldnames
        if fieldnames:
            self._fieldnames = fieldnames
        else:
            self._fieldnames = [td.string
                                for td in rows[0].findAll(('td','th'))]
            # the header row only supplies names; it is not data
            rows = rows[1:]

        self._rows = rows

    def process_tr(self):
        # one record per remaining row, pairing column names with the text
        # extracted from each td cell
        for row in self._rows:
            strings = [utils.string_dig(td) for td in row.findAll('td')]
            yield dict(zip(self._fieldnames, strings))

    def __iter__(self):
        return self.process_tr()


class DjangoModelSource(object):
    """ Saucebrush source that reads its records from a django model.

        Takes a django settings file, an app label, and a model name.  The
        records are produced by the model manager's values() call, limited
        to the fields declared on the model itself.

        DjangoModelSource('settings.py', 'phonebook', 'friend') iterates over
        the rows of the friend model of the phonebook app configured in
        settings.py.
    """
    def __init__(self, dj_settings, app_label, model_name):
        model = utils.get_django_model(dj_settings, app_label, model_name)

        # name the model's own fields explicitly so that a custom manager
        # cannot contribute extra columns
        field_names = [field.name for field in model._meta.fields]
        self._data = model.objects.values(*field_names)

    def __iter__(self):
        # a fresh iterator over the stored result on every call
        return iter(self._data)


class MongoDBSource(object):
    """ Source for reading from a MongoDB database.

        The record dict is populated with records matching the spec
        from the specified database and collection.

        If conn is given it is used as the connection and host/port are
        ignored; otherwise a new connection to host:port is opened.
    """
    def __init__(self, database, collection, spec=None, host='localhost', port=27017, conn=None):
        # compare with None explicitly so that a caller-supplied connection
        # is used even when the object happens to evaluate as false
        if conn is None:
            try:
                from pymongo.connection import Connection
            except ImportError:
                # newer pymongo releases dropped Connection in favour of
                # MongoClient, which takes the same (host, port) arguments
                from pymongo import MongoClient as Connection
            conn = Connection(host, port)
        self.collection = conn[database][collection]
        self.spec = spec

    def __iter__(self):
        return self._find_spec()

    def _find_spec(self):
        # every call runs the query again and yields a plain-dict copy of
        # each matching document
        for doc in self.collection.find(self.spec):
            yield dict(doc)

# dict_factory for sqlite source
def dict_factory(cursor, row):
    """ Row factory that turns a result row into a dict.

        The keys are the first item of each entry in cursor.description and
        each one is paired with the value at the same position in row.
    """
    names = [column[0] for column in cursor.description]
    return dict((name, row[position]) for position, name in enumerate(names))

class SqliteSource(object):
    """ Source that reads from a sqlite database.

        The record dict is populated with the results from the
        query argument. If given, args will be passed to the query
        when executed.

        conn_params, if given, is a mapping of attribute names to values
        that are set on the sqlite3 connection after it has been opened.

        The query is executed again each time the source is iterated.
        Call done() to close the connection once the source is no longer
        needed.
    """

    def __init__(self, dbpath, query, args=None, conn_params=None):

        import sqlite3

        self._dbpath = dbpath
        self._query = query
        self._args = args or []
        # conn_params is a mapping, so its empty default is a dict as well
        self._conn_params = conn_params or {}

        # setup connection
        self._conn = sqlite3.connect(self._dbpath)
        self._conn.row_factory = dict_factory
        # applied after row_factory, so conn_params may override it;
        # items() is available on both Python 2 and Python 3
        for param, value in self._conn_params.items():
            setattr(self._conn, param, value)

    def _process_query(self):

        import sqlite3

        cursor = self._conn.cursor()

        try:
            for row in cursor.execute(self._query, self._args):
                yield row
        finally:
            # also reached when the query fails or the consumer stops
            # iterating early, so the cursor is not left open
            try:
                cursor.close()
            except sqlite3.ProgrammingError:
                # the connection was already closed through done(); there
                # is nothing left to release
                pass

    def __iter__(self):
        return self._process_query()

    def done(self):
        self._conn.close()
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`"""`
			`Saucebrush data sources, convert data in some format into python dicts.`

			`All sources must implement the iterable interface and return python`
			`dictionaries.`
			`"""`

			`import string`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`from saucebrush import utils`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00
			`class CSVSource(object):`
			`""" Saucebrush source for reading from CSV files.`

			`Takes an open csvfile, an optional set of fieldnames and optional number`
			`of rows to skip.`

			`CSVSource(open('test.csv')) will read a csvfile, using the first row as`
			`the field names.`

			`CSVSource(open('test.csv'), ('name', 'phone', 'address'), 1) will read`
			`in a CSV file and treat the three columns as name, phone, and address,`
			`ignoring the first row (presumed to be column names).`
			`"""`

add replace option to sqliteemitter and add better kwargs to dictreader on csvsource 2009-09-01 21:45:05 +00:00			`def __init__(self, csvfile, fieldnames=None, skiprows=0, **kwargs):`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`import csv`
add replace option to sqliteemitter and add better kwargs to dictreader on csvsource 2009-09-01 21:45:05 +00:00			`self._dictreader = csv.DictReader(csvfile, fieldnames, **kwargs)`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`for _ in xrange(skiprows):`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`self._dictreader.next()`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00
			`def __iter__(self):`
			`return self._dictreader`


			`class FixedWidthFileSource(object):`
			`""" Saucebrush source for reading from fixed width field files.`

			`FixedWidthFileSource expects an open fixed width file and a tuple`
			`of fields with their lengths. There is also an optional fillchars`
			`command that is the filler characters to strip from the end of each`
			`field. (defaults to whitespace)`

			`FixedWidthFileSource(open('testfile'), (('name',30), ('phone',12)))`
			`will read in a fixed width file where the first 30 characters of each`
			`line are part of a name and the characters 31-42 are a phone number.`
			`"""`

			`def __init__(self, fwfile, fields, fillchars=string.whitespace):`
			`self._fwfile = fwfile`
			`self._fields_dict = {}`
			`self._fillchars = fillchars`
			`from_offset = 0`
			`to_offset = 0`
			`for field, size in fields:`
			`to_offset += size`
			`self._fields_dict[field] = (from_offset, to_offset)`
			`from_offset += size`

			`def __iter__(self):`
			`return self`

			`def next(self):`
			`line = self._fwfile.next()`
			`record = {}`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`for name, range_ in self._fields_dict.iteritems():`
basic source tests 2010-02-21 18:44:39 +00:00			`record[name] = line[range_[0]:range_[1]].rstrip(self._fillchars)`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`return record`


			`class HtmlTableSource(object):`
			`""" Saucebrush source for reading data from an HTML table.`

			`HtmlTableSource expects an open html file, the id of the table or a`
			`number indicating which table on the page to use, an optional fieldnames`
			`tuple, and an optional number of rows to skip.`

			`HtmlTableSource(open('test.html'), 0) opens the first HTML table and`
			`uses the first row as the names of the columns.`

			`HtmlTableSource(open('test.html'), 'people', ('name','phone'), 1) opens`
			`the HTML table with an id of 'people' and names the two columns`
			`name and phone, skipping the first row where alternate names are`
			`stored.`
			`"""`

			`def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):`

			`# extract the table`
			`from BeautifulSoup import BeautifulSoup`
			`soup = BeautifulSoup(htmlfile.read())`
			`if isinstance(id_or_num, int):`
			`table = soup.findAll('table')[id_or_num]`
			`elif isinstance(id_or_num, str):`
			`table = soup.find('table', id=id_or_num)`

			`# skip the necessary number of rows`
			`self._rows = table.findAll('tr')[skiprows:]`

			`# determine the fieldnames`
			`if not fieldnames:`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`self._fieldnames = [td.string`
			`for td in self._rows[0].findAll(('td','th'))]`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`else:`
			`self._fieldnames = fieldnames`

minor pylint cleanups 2008-11-04 21:39:40 +00:00			`def process_tr(self):`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`for row in self._rows:`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`strings = [utils.string_dig(td) for td in row.findAll('td')]`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00			`yield dict(zip(self._fieldnames, strings))`

			`def __iter__(self):`
			`return self.process_tr()`


			`class DjangoModelSource(object):`
			`""" Saucebrush source for reading data from django models.`

			`DjangoModelSource expects a django settings file, app label, and model`
			`name. The resulting records contain all columns in the table for the`
			`specified model.`

			`DjangoModelSource('settings.py', 'phonebook', 'friend') would read all`
			`friends from the friend model in the phonebook app described in`
			`settings.py.`
			`"""`
			`def __init__(self, dj_settings, app_label, model_name):`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`dbmodel = utils.get_django_model(dj_settings, app_label, model_name)`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00
			`# only get values defined in model (no extra fields from custom manager)`
minor pylint cleanups 2008-11-04 21:39:40 +00:00			`self._data = dbmodel.objects.values(*[f.name`
			`for f in dbmodel._meta.fields])`
saucebrush moving from bzr to svn :( 2008-10-27 19:13:50 +00:00
			`def __iter__(self):`
			`return iter(self._data)`
add MongoDB and sqlite sources 2009-07-22 18:24:21 +00:00

			`class MongoDBSource(object):`
			`""" Source for reading from a MongoDB database.`

			`The record dict is populated with records matching the spec`
			`from the specified database and collection.`
			`"""`
			`def __init__(self, database, collection, spec=None, host='localhost', port=27017, conn=None):`
			`if not conn:`
			`from pymongo.connection import Connection`
			`conn = Connection(host, port)`
			`self.collection = conn[database][collection]`
			`self.spec = spec`

			`def __iter__(self):`
			`return self._find_spec()`

			`def _find_spec(self):`
			`for doc in self.collection.find(self.spec):`
			`yield dict(doc)`

update sqlitesource 2009-07-28 20:44:21 +00:00			`# dict_factory for sqlite source`
			`def dict_factory(cursor, row):`
			`d = { }`
			`for idx, col in enumerate(cursor.description):`
			`d[col[0]] = row[idx]`
			`return d`
add MongoDB and sqlite sources 2009-07-22 18:24:21 +00:00
			`class SqliteSource(object):`
			`""" Source that reads from a sqlite database.`

			`The record dict is populated with the results from the`
			`query argument. If given, args will be passed to the query`
			`when executed.`
			`"""`

			`def __init__(self, dbpath, query, args=None, conn_params=None):`
update sqlitesource 2009-07-28 20:44:21 +00:00
			`import sqlite3`

add MongoDB and sqlite sources 2009-07-22 18:24:21 +00:00			`self._dbpath = dbpath`
			`self._query = query`
			`self._args = args or []`
			`self._conn_params = conn_params or []`

update sqlitesource 2009-07-28 20:44:21 +00:00			`# setup connection`
			`self._conn = sqlite3.connect(self._dbpath)`
			`self._conn.row_factory = dict_factory`
add MongoDB and sqlite sources 2009-07-22 18:24:21 +00:00			`if self._conn_params:`
			`for param, value in self._conn_params.iteritems():`
update sqlitesource 2009-07-28 20:44:21 +00:00			`setattr(self._conn, param, value)`
add MongoDB and sqlite sources 2009-07-22 18:24:21 +00:00
update sqlitesource 2009-07-28 20:44:21 +00:00			`def _process_query(self):`

			`cursor = self._conn.cursor()`
add MongoDB and sqlite sources 2009-07-22 18:24:21 +00:00
			`for row in cursor.execute(self._query, self._args):`
			`yield row`

			`cursor.close()`

			`def __iter__(self):`
			`return self._process_query()`
update sqlitesource 2009-07-28 20:44:21 +00:00
			`def done(self):`
			`self._conn.close()`