flake8 fixes: mostly whitespace, a few real bugs

This commit is contained in:
James Turk 2012-03-10 11:21:43 -08:00
parent 45a93fcc68
commit 981e2cc88f
10 changed files with 19 additions and 38 deletions

View File

@ -1,5 +1,6 @@
from oyster.conf import default_settings from oyster.conf import default_settings
class Settings(object): class Settings(object):
def __init__(self): def __init__(self):
pass pass

View File

@ -2,13 +2,13 @@ import datetime
import hashlib import hashlib
import random import random
import sys import sys
import urllib
import pymongo import pymongo
import scrapelib import scrapelib
from .storage import engines from .storage import engines
class Kernel(object): class Kernel(object):
""" oyster's workhorse, handles tracking """ """ oyster's workhorse, handles tracking """
@ -62,14 +62,12 @@ class Kernel(object):
raise ValueError('doc_class %s missing key %s' % raise ValueError('doc_class %s missing key %s' %
(dc_name, key)) (dc_name, key))
def _wipe(self): def _wipe(self):
""" exists primarily for debug use, wipes entire db """ """ exists primarily for debug use, wipes entire db """
self.db.drop_collection('tracked') self.db.drop_collection('tracked')
self.db.drop_collection('logs') self.db.drop_collection('logs')
self.db.drop_collection('status') self.db.drop_collection('status')
def log(self, action, url, error=False, **kwargs): def log(self, action, url, error=False, **kwargs):
""" add an entry to the oyster log """ """ add an entry to the oyster log """
kwargs['action'] = action kwargs['action'] = action
@ -78,11 +76,9 @@ class Kernel(object):
kwargs['timestamp'] = datetime.datetime.utcnow() kwargs['timestamp'] = datetime.datetime.utcnow()
self.db.logs.insert(kwargs) self.db.logs.insert(kwargs)
def _add_doc_class(self, doc_class, **properties): def _add_doc_class(self, doc_class, **properties):
self.doc_classes[doc_class] = properties self.doc_classes[doc_class] = properties
def track_url(self, url, doc_class, id=None, **kwargs): def track_url(self, url, doc_class, id=None, **kwargs):
""" """
Add a URL to the set of tracked URLs, accessible via a given filename. Add a URL to the set of tracked URLs, accessible via a given filename.
@ -123,14 +119,12 @@ class Kernel(object):
newdoc['_id'] = id newdoc['_id'] = id
return self.db.tracked.insert(newdoc) return self.db.tracked.insert(newdoc)
def md5_versioning(self, olddata, newdata): def md5_versioning(self, olddata, newdata):
""" return True if md5 changed or if file is new """ """ return True if md5 changed or if file is new """
old_md5 = hashlib.md5(olddata).hexdigest() old_md5 = hashlib.md5(olddata).hexdigest()
new_md5 = hashlib.md5(newdata).hexdigest() new_md5 = hashlib.md5(newdata).hexdigest()
return old_md5 != new_md5 return old_md5 != new_md5
def update(self, doc): def update(self, doc):
""" """
perform update upon a given document perform update upon a given document
@ -187,7 +181,7 @@ class Kernel(object):
c_errors = doc.get('consecutive_errors', 0) c_errors = doc.get('consecutive_errors', 0)
doc['consecutive_errors'] = c_errors + 1 doc['consecutive_errors'] = c_errors + 1
if c_errors <= self.retry_attempts: if c_errors <= self.retry_attempts:
update_mins = self.retry_wait_minutes * (2**c_errors) update_mins = self.retry_wait_minutes * (2 ** c_errors)
else: else:
# reset error count if all was ok # reset error count if all was ok
doc['consecutive_errors'] = 0 doc['consecutive_errors'] = 0
@ -203,7 +197,6 @@ class Kernel(object):
self.db.tracked.save(doc, safe=True) self.db.tracked.save(doc, safe=True)
def get_update_queue(self): def get_update_queue(self):
""" """
Get a list of what needs to be updated. Get a list of what needs to be updated.
@ -229,7 +222,6 @@ class Kernel(object):
return queue return queue
def get_update_queue_size(self): def get_update_queue_size(self):
""" """
Get the size of the update queue, this should match Get the size of the update queue, this should match
@ -240,7 +232,7 @@ class Kernel(object):
{'next_update': {'$ne': None}}, {'next_update': {'$ne': None}},
{'next_update': {'$lt': datetime.datetime.utcnow()}}, {'next_update': {'$lt': datetime.datetime.utcnow()}},
]}).count() ]}).count()
return new+next return new + next
def get_last_version(self, doc): def get_last_version(self, doc):
try: try:
@ -251,7 +243,6 @@ class Kernel(object):
return storage.get(doc['versions'][-1]['storage_key']) return storage.get(doc['versions'][-1]['storage_key'])
def _get_configured_kernel(): def _get_configured_kernel():
""" factory, gets a connection configured with oyster.conf.settings """ """ factory, gets a connection configured with oyster.conf.settings """
from oyster.conf import settings from oyster.conf import settings

View File

@ -3,6 +3,7 @@ import argparse
from oyster.core import kernel from oyster.core import kernel
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='do a task for all documents in a doc_class', description='do a task for all documents in a doc_class',
@ -14,10 +15,10 @@ def main():
args = parser.parse_args() args = parser.parse_args()
docs = kernel.db.tracked.find({'doc_class':args.doc_class}) docs = kernel.db.tracked.find({'doc_class': args.doc_class})
print '%s docs in %s' % (docs.count(), args.doc_class) print '%s docs in %s' % (docs.count(), args.doc_class)
path, func = args.function.rsplit('.',1) path, func = args.function.rsplit('.', 1)
mod = __import__(path, fromlist=[func]) mod = __import__(path, fromlist=[func])
func = getattr(mod, func) func = getattr(mod, func)

View File

@ -1,8 +1,3 @@
import urllib
import boto
from oyster.conf import settings
class DummyStorage(object): class DummyStorage(object):
""" should NOT be used outside of testing """ """ should NOT be used outside of testing """

View File

@ -2,8 +2,8 @@ from __future__ import absolute_import
import gridfs import gridfs
class GridFSStorage(object):
class GridFSStorage(object):
storage_type = 'gridfs' storage_type = 'gridfs'
def __init__(self, kernel): def __init__(self, kernel):

View File

@ -13,7 +13,7 @@ class S3Storage(object):
def put(self, tracked_doc, data, content_type): def put(self, tracked_doc, data, content_type):
""" upload the document to S3 """ """ upload the document to S3 """
k = boto.s3.key.Key(self.bucket) k = boto.s3.key.Key(self.bucket)
key_name = getattr(settings, AWS_PREFIX, '') + tracked_doc['_id'] key_name = getattr(settings, 'AWS_PREFIX', '') + tracked_doc['_id']
k.key = key_name k.key = key_name
headers = {'x-amz-acl': 'public-read', headers = {'x-amz-acl': 'public-read',
'Content-Type': content_type} 'Content-Type': content_type}

View File

@ -1,8 +1,6 @@
from celery.task.base import Task, PeriodicTask from celery.task.base import Task, PeriodicTask
from celery.execute import send_task from celery.execute import send_task
from pymongo.objectid import ObjectId
from oyster.core import kernel from oyster.core import kernel
@ -15,7 +13,7 @@ class UpdateTask(Task):
doc = kernel.db.tracked.find_one({'_id': doc_id}) doc = kernel.db.tracked.find_one({'_id': doc_id})
kernel.update(doc) kernel.update(doc)
for task in doc.get('post_update_tasks', []): for task in doc.get('post_update_tasks', []):
send_task(hook, (doc_id,)) send_task(task, (doc_id,))
kernel.db.status.update({}, {'$inc': {'update_queue': -1}}) kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
# don't sit on a connection # don't sit on a connection
kernel.db.connection.end_request() kernel.db.connection.end_request()

View File

@ -3,14 +3,16 @@ import datetime
from unittest import TestCase from unittest import TestCase
from nose.tools import assert_raises from nose.tools import assert_raises
import pymongo
from oyster.core import Kernel from oyster.core import Kernel
def hook_fired(doc, newdata): def hook_fired(doc, newdata):
doc['hook_fired'] = doc.get('hook_fired', 0) + 1 doc['hook_fired'] = doc.get('hook_fired', 0) + 1
RANDOM_URL = 'http://www.random.org/integers/?num=1&min=-1000000000&max=1000000000&col=1&base=10&format=plain&rnd=new' RANDOM_URL = ('http://www.random.org/integers/?num=1&min=-1000000000&'
'max=1000000000&col=1&base=10&format=plain&rnd=new')
class KernelTests(TestCase): class KernelTests(TestCase):
@ -20,7 +22,7 @@ class KernelTests(TestCase):
'onchanged': [] 'onchanged': []
}, },
'fast-update': 'fast-update':
{'update_mins': 1/60., 'storage_engine': 'dummy', {'update_mins': 1 / 60., 'storage_engine': 'dummy',
'onchanged': [] 'onchanged': []
}, },
'one-time': 'one-time':
@ -32,7 +34,8 @@ class KernelTests(TestCase):
'onchanged': [hook_fired] 'onchanged': [hook_fired]
} }
} }
self.kernel = Kernel(mongo_db='oyster_test', retry_wait_minutes=1/60., self.kernel = Kernel(mongo_db='oyster_test',
retry_wait_minutes=1 / 60.,
doc_classes=doc_classes) doc_classes=doc_classes)
self.kernel._wipe() self.kernel._wipe()
@ -53,7 +56,6 @@ class KernelTests(TestCase):
# ensure that a bad document class raises an error # ensure that a bad document class raises an error
assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}}) assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}})
def test_log(self): def test_log(self):
self.kernel.log('action1', 'http://example.com') self.kernel.log('action1', 'http://example.com')
self.kernel.log('action2', 'http://test.com', error=True, pi=3) self.kernel.log('action2', 'http://test.com', error=True, pi=3)
@ -63,7 +65,6 @@ class KernelTests(TestCase):
assert x['url'] == 'http://test.com' assert x['url'] == 'http://test.com'
assert x['pi'] == 3 assert x['pi'] == 3
def test_track_url(self): def test_track_url(self):
# basic insert # basic insert
id1 = self.kernel.track_url('http://example.com', 'default', pi=3) id1 = self.kernel.track_url('http://example.com', 'default', pi=3)
@ -105,7 +106,6 @@ class KernelTests(TestCase):
# logged error # logged error
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'}) assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
def test_no_update(self): def test_no_update(self):
# update # update
self.kernel.track_url('http://example.com', 'one-time') self.kernel.track_url('http://example.com', 'one-time')
@ -122,7 +122,6 @@ class KernelTests(TestCase):
assert not self.kernel.md5_versioning('hello!', 'hello!') assert not self.kernel.md5_versioning('hello!', 'hello!')
assert self.kernel.md5_versioning('hello!', 'hey!') assert self.kernel.md5_versioning('hello!', 'hey!')
def test_update(self): def test_update(self):
# get a single document tracked and call update on it # get a single document tracked and call update on it
self.kernel.track_url('http://example.com', 'default') self.kernel.track_url('http://example.com', 'default')
@ -154,7 +153,6 @@ class KernelTests(TestCase):
# check that logs updated # check that logs updated
assert self.kernel.db.logs.find({'action': 'update'}).count() == 2 assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
def test_update_failure(self): def test_update_failure(self):
# track a non-existent URL # track a non-existent URL
self.kernel.track_url('http://not_a_url', 'default') self.kernel.track_url('http://not_a_url', 'default')
@ -174,7 +172,6 @@ class KernelTests(TestCase):
obj = self.kernel.db.tracked.find_one() obj = self.kernel.db.tracked.find_one()
assert obj['consecutive_errors'] == 2 assert obj['consecutive_errors'] == 2
def test_update_onchanged_fire_only_on_change(self): def test_update_onchanged_fire_only_on_change(self):
self.kernel.track_url('http://example.com', 'change-hook') self.kernel.track_url('http://example.com', 'change-hook')
obj = self.kernel.db.tracked.find_one() obj = self.kernel.db.tracked.find_one()
@ -201,7 +198,6 @@ class KernelTests(TestCase):
doc = self.kernel.db.tracked.find_one() doc = self.kernel.db.tracked.find_one()
assert doc['hook_fired'] == 2 assert doc['hook_fired'] == 2
def test_get_update_queue(self): def test_get_update_queue(self):
self.kernel.track_url('never-updates', 'fast-update') self.kernel.track_url('never-updates', 'fast-update')
self.kernel.track_url('bad-uri', 'fast-update') self.kernel.track_url('bad-uri', 'fast-update')
@ -229,15 +225,12 @@ class KernelTests(TestCase):
queue = self.kernel.get_update_queue() queue = self.kernel.get_update_queue()
assert len(queue) == 3 assert len(queue) == 3
def test_get_update_queue_size(self): def test_get_update_queue_size(self):
self.kernel.track_url('a', 'fast-update') self.kernel.track_url('a', 'fast-update')
self.kernel.track_url('b', 'fast-update') self.kernel.track_url('b', 'fast-update')
self.kernel.track_url('c', 'fast-update') self.kernel.track_url('c', 'fast-update')
a = self.kernel.db.tracked.find_one(dict(url='a')) a = self.kernel.db.tracked.find_one(dict(url='a'))
b = self.kernel.db.tracked.find_one(dict(url='b'))
c = self.kernel.db.tracked.find_one(dict(url='c'))
# size should start at 3 # size should start at 3
assert self.kernel.get_update_queue_size() == 3 assert self.kernel.get_update_queue_size() == 3

View File

@ -3,6 +3,7 @@ from oyster.storage.s3 import S3Storage
from oyster.storage.gridfs import GridFSStorage from oyster.storage.gridfs import GridFSStorage
from oyster.storage.dummy import DummyStorage from oyster.storage.dummy import DummyStorage
def _simple_storage_test(StorageCls): def _simple_storage_test(StorageCls):
kernel = Kernel(mongo_db='oyster_test') kernel = Kernel(mongo_db='oyster_test')
storage = StorageCls(kernel) storage = StorageCls(kernel)

View File

@ -44,6 +44,7 @@ def api_wrapper(template=None):
app = flask.Flask('oyster') app = flask.Flask('oyster')
@app.route('/') @app.route('/')
@api_wrapper('index.html') @api_wrapper('index.html')
def index(): def index():