Compare commits


No commits in common. "master" and "0.3.2" have entirely different histories.

23 changed files with 153 additions and 362 deletions

.gitignore

@@ -1,5 +1,3 @@
celerybeat-schedule
oyster_settings.py
oyster.egg-info/
*.pyc
.tox


@@ -2,9 +2,8 @@ language: python
python:
- "2.6"
- "2.7"
install: pip install scrapelib pymongo nose celery --use-mirrors
install: pip install scrapelib pymongo nose --use-mirrors
script: nosetests
services: mongodb
notifications:
email:
- jturk@sunlightfoundation.com


@@ -1,5 +1,3 @@
**DEPRECATED** - this project is abandoned & will not be seeing future updates
======
oyster
======


@@ -1,14 +1,6 @@
oyster changelog
================
0.4.0-dev
---------
* S3 storage backend bugfix
* lots of improvements to signal script
* oyster.ext cloudsearch, elasticsearch, and superfastmatch
* use python logging w/ mongo handler
* add tox/python setup.py test (thanks Marc Abramowitz!)
0.3.2
-----
**2012-03-29**


@@ -1,4 +1,4 @@
__version__ = "0.4.0-dev"
__version__ = "0.3.2"
import os
os.environ['CELERY_CONFIG_MODULE'] = 'oyster.celeryconfig'


@@ -1,3 +1,13 @@
from oyster.conf import settings
CELERY_IMPORTS = ['oyster.tasks'] + list(settings.CELERY_TASK_MODULES)
CELERY_IMPORTS = ["oyster.tasks"] + list(settings.CELERY_TASK_MODULES)
BROKER_TRANSPORT = 'mongodb'
BROKER_HOST = settings.MONGO_HOST
BROKER_PORT = settings.MONGO_PORT
CELERY_RESULT_BACKEND = 'mongodb'
CELERY_MONGODB_BACKEND_SETTINGS = {
'host': settings.MONGO_HOST,
'port': settings.MONGO_PORT,
}
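
The version stanza above sets CELERY_CONFIG_MODULE to oyster.celeryconfig, so Celery picks up this module, and both versions of it read their connection details from oyster.conf.settings. A minimal sketch of the settings the 0.3.2 config expects (names taken from the hunk above; the values shown are illustrative assumptions):

    # illustrative values only; the real ones come from oyster.conf.settings
    MONGO_HOST = 'localhost'   # reused as the Celery broker / result backend host
    MONGO_PORT = 27017
    CELERY_TASK_MODULES = []   # extra task modules appended to CELERY_IMPORTS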


@@ -1,5 +1,4 @@
import datetime
import logging
import hashlib
import random
import sys
@@ -7,9 +6,7 @@ import sys
import pymongo
import scrapelib
from .mongolog import MongoHandler
from .storage import engines
from celery.execute import send_task
class Kernel(object):
@@ -25,22 +22,21 @@ class Kernel(object):
configurable for ease of testing, only one should be instantiated
"""
# set up the log
# set up a capped log if it doesn't exist
self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
self.log = logging.getLogger('oyster')
self.log.setLevel(logging.DEBUG)
self.log.addHandler(MongoHandler(mongo_db, host=mongo_host,
port=mongo_port,
capped_size=mongo_log_maxsize))
try:
self.db.create_collection('logs', capped=True,
size=mongo_log_maxsize)
except pymongo.errors.CollectionInvalid:
# cap collection if not capped?
pass
# create status document if it doesn't exist
if self.db.status.count() == 0:
self.db.status.insert({'update_queue': 0})
# ensure an index on _random
self.db.tracked.ensure_index('_random')
self.db.tracked.ensure_index('url')
self.db.tracked.ensure_index([('_random', pymongo.ASCENDING)])
self.scraper = scrapelib.Scraper(user_agent=user_agent,
requests_per_minute=rpm,
@@ -74,6 +70,14 @@ class Kernel(object):
self.db.drop_collection('logs')
self.db.drop_collection('status')
def log(self, action, url, error=False, **kwargs):
""" add an entry to the oyster log """
kwargs['action'] = action
kwargs['url'] = url
kwargs['error'] = error
kwargs['timestamp'] = datetime.datetime.utcnow()
self.db.logs.insert(kwargs)
def _add_doc_class(self, doc_class, **properties):
self.doc_classes[doc_class] = properties
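
Logging is one of the clearest differences between the branches: 0.3.2 writes structured documents straight into the capped logs collection through this kernel.log method, while master routes messages through the standard logging module and the MongoHandler shown further down. A usage sketch of both styles (the kernel import matches the one used by the scripts and tasks elsewhere in this diff):

    from oyster.core import kernel

    # 0.3.2: structured entries stored in kernel.db.logs
    kernel.log('track', url='http://example.com')
    kernel.log('update', url='http://example.com', error='fetch failed')

    # master: kernel.log is a logging.Logger persisted by MongoHandler
    kernel.log.info('tracked %s [%s]', 'http://example.com', 'some-doc-id')
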
@@ -89,16 +93,17 @@ class Kernel(object):
any keyword args will be added to the document's metadata
"""
if doc_class not in self.doc_classes:
error = 'error tracking %s, unregistered doc_class %s'
self.log.error(error, url, doc_class)
raise ValueError(error % (url, doc_class))
error = 'unregistered doc_class %s' % doc_class
self.log('track', url=url, error=error)
raise ValueError(error)
# try and find an existing version of this document
tracked = None
if id:
tracked = self.db.tracked.find_one({'_id': id})
else:
if not tracked:
tracked = self.db.tracked.find_one({'url': url})
# if id exists, ensure that URL and doc_class haven't changed
@@ -112,14 +117,15 @@ class Kernel(object):
return tracked['_id']
else:
# id existed but with different URL
message = ('%s already exists with different data (tracked: '
'%s, %s) (new: %s, %s)')
args = (tracked['_id'], tracked['url'], tracked['doc_class'],
url, doc_class)
self.log.error(message, *args)
raise ValueError(message % args)
error = ('%s already exists with different data (tracked: '
'%s, %s) (new: %s, %s)' % (tracked['_id'],
tracked['url'],
tracked['doc_class'],
url, doc_class))
self.log('track', url=url, error=error)
raise ValueError(error)
self.log.info('tracked %s [%s]', url, id)
self.log('track', url=url)
newdoc = dict(url=url, doc_class=doc_class,
_random=random.randint(0, sys.maxint),
@@ -181,8 +187,8 @@ class Kernel(object):
'storage_type': storage.storage_type,
})
# fire off onchanged functions
for onchanged in doc_class.get('onchanged', []):
send_task(onchanged, (doc['_id'],))
for onchanged in doc_class['onchanged']:
onchanged(doc, newdata)
if error:
# if there's been an error, increment the consecutive_errors count
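
The onchanged hook changes shape here as well: on master each entry is a dotted Celery task name fired with send_task, while in 0.3.2 each entry is a plain callable invoked as hook(doc, newdata) when the content changes. An illustrative doc_class entry (the hook and the dict name are hypothetical; other required doc_class options are omitted):

    def flag_changed(doc, newdata):
        # hypothetical 0.3.2-style hook: record that the document changed
        doc['hook_fired'] = doc.get('hook_fired', 0) + 1

    change_hook_class = {
        'onchanged': [flag_changed],
        # on master this list would hold task names instead, e.g.
        # ['oyster.ext.elasticsearch.ElasticSearchPush'], queued via send_task
    }
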
@@ -202,11 +208,7 @@ class Kernel(object):
else:
doc['next_update'] = None
if error:
self.log.warning('error updating %s [%s]', url, doc['_id'])
else:
new_version = ' (new)'
self.log.info('updated %s [%s]%s', url, doc['_id'], new_version)
self.log('update', url=url, new_doc=new_version, error=error)
self.db.tracked.save(doc, safe=True)
@@ -255,16 +257,6 @@ class Kernel(object):
storage = self.storage[doc_class['storage_engine']]
return storage.get(doc['versions'][-1]['storage_key'])
def extract_text(self, doc):
version = self.get_last_version(doc)
doc_class = self.doc_classes[doc['doc_class']]
try:
extract_text = doc_class['extract_text']
except KeyError:
raise ValueError('doc_class %s missing extract_text' %
doc['doc_class'])
return extract_text(doc, version)
def _get_configured_kernel():
""" factory, gets a connection configured with oyster.conf.settings """


@@ -1,32 +0,0 @@
# needed so we can import cloudsearch
from __future__ import absolute_import
from celery.task.base import Task
from ..core import kernel
from ..conf import settings
from cloudsearch import CloudSearch
cs = CloudSearch(settings.CLOUDSEARCH_DOMAIN, settings.CLOUDSEARCH_ID, 20)
class CloudSearchPush(Task):
""" task that updates documents """
# results go straight to database
ignore_result = True
# a bit under 1MB
MAX_BYTES = 1048000
def run(self, doc_id):
doc = kernel.db.tracked.find_one({'_id': doc_id})
text = kernel.extract_text(doc)
pieces = [text[i:i+self.MAX_BYTES] for i in
xrange(0, len(text), self.MAX_BYTES)]
self.get_logger().debug('adding {0} pieces for {1}'.format(
len(pieces), doc_id))
for i, piece in enumerate(pieces):
cloud_id = '%s_%s' % (doc_id.lower(), i)
cs.add_document(cloud_id, text=piece, **doc['metadata'])


@@ -1,36 +0,0 @@
import logging
from celery.task.base import Task
from ..core import kernel
from ..conf import settings
from pyes import ES
es = ES(settings.ELASTICSEARCH_HOST)
log = logging.getLogger('oyster.ext.elasticsearch')
class ElasticSearchPush(Task):
# results go straight to elasticsearch
ignore_result = True
def run(self, doc_id):
doc = kernel.db.tracked.find_one({'_id': doc_id})
try:
text = kernel.extract_text(doc)
if not text:
log.info('no text for %s', doc_id,
extra={'doc_class':doc['doc_class']})
return
log.info('tracked %s', doc_id,
extra={'doc_class':doc['doc_class']})
es.index(dict(doc['metadata'], text=text),
settings.ELASTICSEARCH_INDEX,
settings.ELASTICSEARCH_DOC_TYPE,
id=doc_id)
except Exception as e:
log.warning('error tracking %s', doc_id,
extra={'doc_class':doc['doc_class']}, exc_info=True)
raise


@@ -1,23 +0,0 @@
# needed so we can import superfastmatch.Client
from __future__ import absolute_import
from celery.task.base import Task
from ..core import kernel
from ..conf import settings
from superfastmatch import Client
sfm = Client(settings.SUPERFASTMATCH_URL)
class SuperFastMatchPush(Task):
""" task that pushes documents to SFM """
# results go straight to database
ignore_result = True
def run(self, doc_id):
doc = kernel.db.tracked.find_one({'_id': doc_id})
text = kernel.extract_text(doc)
doctype, docid = settings.SUPERFASTMATCH_ID_FUNC(doc_id)
sfm.add(doctype, docid, text, defer=True, **doc['metadata'])
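
These oyster.ext modules exist only on master (the changelog lists them under 0.4.0-dev) and share one pattern: a Celery Task fetches the tracked document, runs kernel.extract_text on it, and pushes the text to an external service. They are queued by dotted task name, for example from the signal script below or a document's post_update_tasks; a minimal dispatch sketch (assuming Celery 2.x's default module-plus-class task naming):

    from celery.execute import send_task

    # queue a push for one tracked document id
    send_task('oyster.ext.elasticsearch.ElasticSearchPush', ('some-doc-id',))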


@@ -1,54 +0,0 @@
"""
MongoDB handler for Python Logging
inspired by https://github.com/andreisavu/mongodb-log
"""
import logging
import datetime
import socket
import pymongo
class MongoFormatter(logging.Formatter):
def format(self, record):
""" turn a LogRecord into something mongo can store """
data = record.__dict__.copy()
data.update(
# format message
message=record.getMessage(),
# overwrite created (float) w/ a mongo-compatible datetime
created=datetime.datetime.utcnow(),
host=socket.gethostname(),
args=tuple(unicode(arg) for arg in record.args)
)
data.pop('msecs') # not needed, stored in created
# TODO: ensure everything in 'extra' is MongoDB-ready
exc_info = data.get('exc_info')
if exc_info:
data['exc_info'] = self.formatException(exc_info)
return data
class MongoHandler(logging.Handler):
def __init__(self, db, collection='logs', host='localhost', port=None,
capped_size=100000000, level=logging.NOTSET, async=True):
db = pymongo.connection.Connection(host, port)[db]
# try and create the capped log collection
if capped_size:
try:
db.create_collection(collection, capped=True, size=capped_size)
except pymongo.errors.CollectionInvalid:
pass
self.collection = db[collection]
self.async = async
logging.Handler.__init__(self, level)
self.formatter = MongoFormatter()
def emit(self, record):
# explicitly set safe=False to get async insert
# TODO: what to do if an error occurs? not safe to log-- ignore?
self.collection.save(self.format(record), safe=not self.async)
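
mongolog.py also exists only on master and backs the logging calls in the core.py hunks above. A minimal attachment sketch (the module path is inferred from the relative imports in this diff; the host and size values are illustrative):

    import logging
    from oyster.mongolog import MongoHandler

    log = logging.getLogger('oyster')
    log.setLevel(logging.DEBUG)
    log.addHandler(MongoHandler('oyster', host='localhost', capped_size=100000000))
    log.info('tracked %s [%s]', 'http://example.com', 'some-doc-id')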


@@ -1,53 +1,30 @@
#!/usr/bin/env python
import argparse
import traceback
import random
from celery.execute import send_task
from celery import current_app
from oyster.core import kernel
def main():
parser = argparse.ArgumentParser(
description='do a task for all documents in a doc_class',
)
parser.add_argument('task', type=str, help='task name to apply')
parser.add_argument('function', type=str, help='path to function to apply')
parser.add_argument('doc_class', type=str,
help='doc_class to apply function to')
parser.add_argument('--sample', action='store_true')
parser.add_argument('--immediate', action='store_true')
args = parser.parse_args()
docs = kernel.db.tracked.find({'doc_class': args.doc_class,
'versions': {'$ne': []}
}, timeout=False)
total = docs.count()
print '{0} docs in {1}'.format(total, args.doc_class)
docs = kernel.db.tracked.find({'doc_class': args.doc_class})
print '%s docs in %s' % (docs.count(), args.doc_class)
if args.sample:
limit = 100
print 'sampling {0} documents'.format(limit)
docs = docs.limit(limit).skip(random.randint(0, total-limit))
args.immediate = True
path, func = args.function.rsplit('.', 1)
mod = __import__(path, fromlist=[func])
func = getattr(mod, func)
errors = 0
if args.immediate:
module, name = args.task.rsplit('.', 1)
task = getattr(__import__(module, fromlist=[name]), name)
for doc in docs:
try:
task.apply((doc['_id'],), throw=True)
except Exception:
errors += 1
traceback.print_exc()
print '{0} errors in {1} documents'.format(errors, total)
else:
for doc in docs:
send_task(args.task, (doc['_id'], ))
for doc in docs:
func(doc, kernel.get_last_version(doc))
# optionally save doc?
if __name__ == '__main__':
main()


@@ -14,19 +14,19 @@ class S3Storage(object):
@property
def bucket(self):
if not self._bucket:
self._bucket = self.s3conn.get_bucket(settings.AWS_BUCKET)
self._bucket = self.s3conn.create_bucket(settings.AWS_BUCKET)
return self._bucket
def _get_opt(self, doc_class, setting, default=None):
""" doc_class first, then setting, then default """
return self.kernel.doc_classes[doc_class].get(setting,
getattr(settings, setting, default))
self.kernel.doc_classes[doc_class].get(setting,
getattr(settings, setting, default))
def put(self, tracked_doc, data, content_type):
""" upload the document to S3 """
k = boto.s3.key.Key(self.bucket)
aws_prefix = self._get_opt(tracked_doc['doc_class'], 'AWS_PREFIX', '')
aws_bucket = self._get_opt(tracked_doc['doc_class'], 'AWS_BUCKET')
k = boto.s3.key.Key(self.bucket)
key_name = aws_prefix + tracked_doc['_id']
k.key = key_name
headers = {'x-amz-acl': 'public-read',


@@ -11,8 +11,10 @@ class UpdateTask(Task):
def run(self, doc_id):
doc = kernel.db.tracked.find_one({'_id': doc_id})
kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
kernel.update(doc)
for task in doc.get('post_update_tasks', []):
send_task(task, (doc_id,))
kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
# don't sit on a connection
kernel.db.connection.end_request()
@@ -22,24 +24,15 @@ class UpdateTaskScheduler(PeriodicTask):
# 60s tick
run_every = 60
ignore_result = True
def run(self):
# if the update queue isn't empty, wait to add more
# (currently the only way we avoid duplicates)
# alternate option would be to set a _queued flag on documents
update_queue_size = kernel.db.status.find_one()['update_queue']
if update_queue_size:
self.get_logger().debug('waiting, update_queue_size={0}'.format(
update_queue_size))
if kernel.db.status.find_one()['update_queue']:
return
next_set = kernel.get_update_queue()
if next_set:
self.get_logger().debug('repopulating update_queue')
else:
self.get_logger().debug('kernel.update_queue empty')
for doc in next_set:
UpdateTask.delay(doc['_id'])
kernel.db.status.update({}, {'$inc': {'update_queue': 1}})
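
The update_queue counter in the single status document is, as the comment above notes, the only duplicate-suppression mechanism: the scheduler refuses to enqueue while it is non-zero and increments it once per queued document, and each UpdateTask run decrements it. A condensed sketch of that bookkeeping (module paths match CELERY_IMPORTS earlier in this diff):

    from oyster.core import kernel
    from oyster.tasks import UpdateTask

    status = kernel.db.status.find_one()
    if not status['update_queue']:
        for doc in kernel.get_update_queue():
            UpdateTask.delay(doc['_id'])                                 # enqueue
            kernel.db.status.update({}, {'$inc': {'update_queue': 1}})   # count it
    # each UpdateTask.run later does the matching decrement:
    #     kernel.db.status.update({}, {'$inc': {'update_queue': -1}})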


@@ -1,6 +1,6 @@
<tr class="{{log.levelname.lower}}">
<td>{{log.name}}</td>
<td>{{log.message}}</td>
<td>{{log.created.strftime("%Y-%m-%d %H:%M:%S")}}</td>
<td>{% if log.exc_info %}{{log.exc_info}}{% endif %}</td>
<tr{% if log.error %} class="error" {% endif %}>
<td>{{log.action}}</td>
<td><a href="{{request.script_root}}/tracked/{{log.url}}">{{log.url}}</td>
<td>{{log.timestamp.strftime("%Y-%m-%d %H:%M:%S")}}</td>
<td>{% if log.error %}{{log.error}}{% endif %}</td>
</tr>


@@ -2,7 +2,7 @@ import time
import datetime
from unittest import TestCase
from nose.tools import assert_raises, assert_equal
from nose.tools import assert_raises
from oyster.core import Kernel
@@ -44,6 +44,8 @@ class KernelTests(TestCase):
retry_attempts=7, retry_wait_minutes=8)
assert c.db.connection.host == '127.0.0.1'
assert c.db.connection.port == 27017
assert c.db.logs.options()['capped'] == True
assert c.db.logs.options()['size'] == 5000
assert c.retry_wait_minutes == 8
# TODO: test retry_attempts
assert c.scraper.user_agent == 'test-ua'
@@ -53,6 +55,15 @@ class KernelTests(TestCase):
# ensure that a bad document class raises an error
assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}})
def test_log(self):
self.kernel.log('action1', 'http://example.com')
self.kernel.log('action2', 'http://test.com', error=True, pi=3)
assert self.kernel.db.logs.count() == 2
x = self.kernel.db.logs.find_one({'error': True})
assert x['action'] == 'action2'
assert x['url'] == 'http://test.com'
assert x['pi'] == 3
def test_track_url(self):
# basic insert
id1 = self.kernel.track_url('http://example.com', 'default', pi=3)
@@ -62,6 +73,11 @@ class KernelTests(TestCase):
assert obj['metadata'] == {'pi': 3}
assert obj['versions'] == []
# logging
log = self.kernel.db.logs.find_one()
assert log['action'] == 'track'
assert log['url'] == 'http://example.com'
# track same url again with same metadata returns id
id2 = self.kernel.track_url('http://example.com', 'default', pi=3)
assert id1 == id2
@@ -72,12 +88,16 @@ class KernelTests(TestCase):
assert out == 'fixed-id'
# can't pass track same id twice with different url
self.kernel.db.logs.drop()
assert_raises(ValueError, self.kernel.track_url,
'http://example.com/3', 'default', 'fixed-id')
assert 'already exists' in self.kernel.db.logs.find_one()['error']
# ... or different doc class
self.kernel.db.logs.drop()
assert_raises(ValueError, self.kernel.track_url,
'http://example.com/2', 'change-hook', 'fixed-id')
assert 'already exists' in self.kernel.db.logs.find_one()['error']
# different metadata is ok, but it should be updated
self.kernel.track_url('http://example.com/2', 'default', 'fixed-id',
@@ -115,6 +135,9 @@ class KernelTests(TestCase):
assert len(newobj['versions']) == 1
# check logs
assert self.kernel.db.logs.find({'action': 'update'}).count() == 1
# and do another update..
self.kernel.update(obj)
@@ -125,6 +148,9 @@ class KernelTests(TestCase):
newobj = self.kernel.db.tracked.find_one()
assert first_update < newobj['last_update']
# check that logs updated
assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
def test_update_failure(self):
# track a non-existent URL
self.kernel.track_url('http://not_a_url', 'default')
@@ -134,37 +160,41 @@ class KernelTests(TestCase):
obj = self.kernel.db.tracked.find_one()
assert obj['consecutive_errors'] == 1
# we should have logged an error too
assert self.kernel.db.logs.find({'action': 'update',
'error': {'$ne': False}}).count() == 1
# update again
self.kernel.update(obj)
obj = self.kernel.db.tracked.find_one()
assert obj['consecutive_errors'] == 2
#def test_update_onchanged_fire_only_on_change(self):
# self.kernel.track_url('http://example.com', 'change-hook')
# obj = self.kernel.db.tracked.find_one()
# self.kernel.update(obj)
def test_update_onchanged_fire_only_on_change(self):
self.kernel.track_url('http://example.com', 'change-hook')
obj = self.kernel.db.tracked.find_one()
self.kernel.update(obj)
# doc = self.kernel.db.tracked.find_one()
# assert doc['hook_fired'] == 1
doc = self.kernel.db.tracked.find_one()
assert doc['hook_fired'] == 1
# # again, we rely on example.com not updating
# self.kernel.update(obj)
# doc = self.kernel.db.tracked.find_one()
# assert doc['hook_fired'] == 1
# again, we rely on example.com not updating
self.kernel.update(obj)
doc = self.kernel.db.tracked.find_one()
assert doc['hook_fired'] == 1
#def test_update_onchanged_fire_again_on_change(self):
# self.kernel.track_url(RANDOM_URL, 'change-hook')
# obj = self.kernel.db.tracked.find_one()
# self.kernel.update(obj)
def test_update_onchanged_fire_again_on_change(self):
self.kernel.track_url(RANDOM_URL, 'change-hook')
obj = self.kernel.db.tracked.find_one()
self.kernel.update(obj)
# doc = self.kernel.db.tracked.find_one()
# assert doc['hook_fired'] == 1
doc = self.kernel.db.tracked.find_one()
assert doc['hook_fired'] == 1
# # we rely on this URL updating
# self.kernel.update(obj)
# doc = self.kernel.db.tracked.find_one()
# assert doc['hook_fired'] == 2
# we rely on this URL updating
self.kernel.update(obj)
doc = self.kernel.db.tracked.find_one()
assert doc['hook_fired'] == 2
def test_get_update_queue(self):
self.kernel.track_url('never-updates', 'fast-update')


@@ -1,53 +0,0 @@
import unittest
import logging
import datetime
import pymongo
from ..mongolog import MongoHandler
class TestMongoLog(unittest.TestCase):
DB_NAME = 'oyster_test'
def setUp(self):
pymongo.Connection().drop_database(self.DB_NAME)
self.log = logging.getLogger('mongotest')
self.log.setLevel(logging.DEBUG)
self.logs = pymongo.Connection()[self.DB_NAME]['logs']
# clear handlers upon each setup
self.log.handlers = []
# async = False for testing
self.log.addHandler(MongoHandler(self.DB_NAME, capped_size=4000,
async=False))
def tearDown(self):
pymongo.Connection().drop_database(self.DB_NAME)
def test_basic_write(self):
self.log.debug('test')
self.assertEqual(self.logs.count(), 1)
self.log.debug('test')
self.assertEqual(self.logs.count(), 2)
# capped_size will limit these
self.log.debug('test'*200)
self.log.debug('test'*200)
self.assertEqual(self.logs.count(), 1)
def test_attributes(self):
self.log.debug('pi=%s', 3.14, extra={'pie':'pizza'})
logged = self.logs.find_one()
self.assertEqual(logged['message'], 'pi=3.14')
self.assertTrue(isinstance(logged['created'], datetime.datetime))
self.assertTrue('host' in logged)
self.assertEqual(logged['name'], 'mongotest')
self.assertEqual(logged['levelname'], 'DEBUG')
self.assertEqual(logged['pie'], 'pizza')
# and exc_info
try:
raise Exception('error!')
except:
self.log.warning('oh no!', exc_info=True)
logged = self.logs.find_one(sort=[('$natural', -1)])
self.assertEqual(logged['levelname'], 'WARNING')
self.assertTrue('error!' in logged['exc_info'])


@@ -8,14 +8,14 @@ from oyster.storage.dummy import DummyStorage
def _simple_storage_test(StorageCls):
kernel = Kernel(mongo_db='oyster_test')
kernel.doc_classes['default'] = {}
storage = StorageCls(kernel)
# ensure the class has a storage_type attribute
assert hasattr(storage, 'storage_type')
doc = {'_id': 'aabbccddeeff', 'url': 'http://localhost:8000/#test',
'doc_class': 'default', 'metadata': {} }
'metadata': {}
}
storage_id = storage.put(doc, 'hello oyster', 'text/plain')
assert storage_id


@@ -4,7 +4,7 @@ import datetime
import functools
import flask
import bson.objectid
import pymongo.objectid
from oyster.conf import settings
from oyster.core import kernel
@@ -14,7 +14,7 @@ class JSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, datetime.datetime):
return obj.isoformat()
elif isinstance(obj, bson.objectid.ObjectId):
elif isinstance(obj, pymongo.objectid.ObjectId):
return str(obj)
else:
return super(JSONEncoder, self).default(obj)
@@ -51,12 +51,22 @@ def index():
status = {
'tracking': kernel.db.tracked.count(),
'need_update': kernel.get_update_queue_size(),
'logs': list(kernel.db.logs.find().sort('$natural', -1).limit(100)),
'logs': list(kernel.db.logs.find().sort('$natural', -1).limit(20)),
'mongo_host': settings.MONGO_HOST,
}
return status
@app.route('/status/')
@api_wrapper()
def doc_list():
status = {
'tracking': kernel.db.tracked.count(),
'need_update': kernel.get_update_queue_size(),
}
return status
@app.route('/log/')
@api_wrapper('logs.html')
def log_view():
@@ -76,11 +86,23 @@ def tracked():
return json.dumps(tracked, cls=JSONEncoder)
@app.route('/tracked/<id>')
def tracked_view(id):
doc = kernel.db.tracked.find_one({'_id': id})
@app.route('/tracked/<path:url>')
def tracked_view(url):
url = _path_fixer(url)
doc = kernel.db.tracked.find_one({'url': url})
return json.dumps(doc, cls=JSONEncoder)
@app.route('/doc/<path:url>/<version>')
def show_doc(url, version):
url = _path_fixer(url)
if version == 'latest':
version = -1
doc = kernel.get_version(url, version)
resp = flask.make_response(doc.read())
resp.headers['content-type'] = doc.content_type
return resp
if __name__ == '__main__':
app.run(debug=True)
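
For orientation, the 0.3.2 app mostly serves JSON, with the HTML log view as the exception. A small client sketch using the Python 2 standard library (it assumes the Flask dev server started above on its default port 5000, that api_wrapper serializes dicts to JSON like the other views do, and it glosses over URL-encoding of the tracked URL):

    import json
    import urllib2

    base = 'http://localhost:5000'
    # tracking / update-queue counts
    print json.loads(urllib2.urlopen(base + '/status/').read())
    # a tracked document, looked up by URL in 0.3.2 (master looks up by _id instead)
    print json.loads(urllib2.urlopen(base + '/tracked/http://example.com').read())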


@@ -2,4 +2,4 @@ scrapelib
pymongo>=2.0
flask
nose
celery==2.5.3
celery


@@ -1,21 +1,11 @@
#!/usr/bin/env python
import os
from setuptools import setup
# Hack to prevent stupid "TypeError: 'NoneType' object is not callable" error
# in multiprocessing/util.py _exit_function when running `python
# setup.py test` (see
# http://www.eby-sarna.com/pipermail/peak/2010-May/003357.html)
try:
import multiprocessing
except ImportError:
pass
from oyster import __version__
long_description = open('README.rst').read()
setup(name="oyster",
version='0.4.0-dev',
version=__version__,
py_modules=['oyster'],
author="James Turk",
author_email='jturk@sunlightfoundation.com',
@@ -31,10 +21,8 @@ setup(name="oyster",
"Operating System :: OS Independent",
"Programming Language :: Python",
],
install_requires=["httplib2 >= 0.6.0", "scrapelib >= 0.7.2",
install_requires=["httplib2 >= 0.6.0", "scrapelib >= 0.5.4",
"pymongo >= 1.11", "flask", "celery"],
tests_require=["nose"],
test_suite='nose.collector',
entry_points="""
[console_scripts]
scrapeshell = scrapelib:scrapeshell

tox.ini

@@ -1,10 +0,0 @@
# Tox (http://codespeak.net/~hpk/tox/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
envlist = py26, py27, pypy
[testenv]
commands = python setup.py test