flake8 fixes: mostly whitespace, a few real bugs

This commit is contained in:
James Turk 2012-03-10 11:21:43 -08:00
parent 45a93fcc68
commit 981e2cc88f
10 changed files with 19 additions and 38 deletions

View File

@ -1,5 +1,6 @@
from oyster.conf import default_settings
class Settings(object):
def __init__(self):
pass

View File

@ -2,13 +2,13 @@ import datetime
import hashlib
import random
import sys
import urllib
import pymongo
import scrapelib
from .storage import engines
class Kernel(object):
""" oyster's workhorse, handles tracking """
@ -62,14 +62,12 @@ class Kernel(object):
raise ValueError('doc_class %s missing key %s' %
(dc_name, key))
def _wipe(self):
""" exists primarily for debug use, wipes entire db """
self.db.drop_collection('tracked')
self.db.drop_collection('logs')
self.db.drop_collection('status')
def log(self, action, url, error=False, **kwargs):
""" add an entry to the oyster log """
kwargs['action'] = action
@ -78,11 +76,9 @@ class Kernel(object):
kwargs['timestamp'] = datetime.datetime.utcnow()
self.db.logs.insert(kwargs)
def _add_doc_class(self, doc_class, **properties):
self.doc_classes[doc_class] = properties
def track_url(self, url, doc_class, id=None, **kwargs):
"""
Add a URL to the set of tracked URLs, accessible via a given filename.
@ -123,14 +119,12 @@ class Kernel(object):
newdoc['_id'] = id
return self.db.tracked.insert(newdoc)
def md5_versioning(self, olddata, newdata):
""" return True if md5 changed or if file is new """
old_md5 = hashlib.md5(olddata).hexdigest()
new_md5 = hashlib.md5(newdata).hexdigest()
return old_md5 != new_md5
def update(self, doc):
"""
perform update upon a given document
@ -187,7 +181,7 @@ class Kernel(object):
c_errors = doc.get('consecutive_errors', 0)
doc['consecutive_errors'] = c_errors + 1
if c_errors <= self.retry_attempts:
update_mins = self.retry_wait_minutes * (2**c_errors)
update_mins = self.retry_wait_minutes * (2 ** c_errors)
else:
# reset error count if all was ok
doc['consecutive_errors'] = 0
@ -203,7 +197,6 @@ class Kernel(object):
self.db.tracked.save(doc, safe=True)
def get_update_queue(self):
"""
Get a list of what needs to be updated.
@ -229,7 +222,6 @@ class Kernel(object):
return queue
def get_update_queue_size(self):
"""
Get the size of the update queue, this should match
@ -240,7 +232,7 @@ class Kernel(object):
{'next_update': {'$ne': None}},
{'next_update': {'$lt': datetime.datetime.utcnow()}},
]}).count()
return new+next
return new + next
def get_last_version(self, doc):
try:
@ -251,7 +243,6 @@ class Kernel(object):
return storage.get(doc['versions'][-1]['storage_key'])
def _get_configured_kernel():
""" factory, gets a connection configured with oyster.conf.settings """
from oyster.conf import settings

View File

@ -3,6 +3,7 @@ import argparse
from oyster.core import kernel
def main():
parser = argparse.ArgumentParser(
description='do a task for all documents in a doc_class',
@ -14,10 +15,10 @@ def main():
args = parser.parse_args()
docs = kernel.db.tracked.find({'doc_class':args.doc_class})
docs = kernel.db.tracked.find({'doc_class': args.doc_class})
print '%s docs in %s' % (docs.count(), args.doc_class)
path, func = args.function.rsplit('.',1)
path, func = args.function.rsplit('.', 1)
mod = __import__(path, fromlist=[func])
func = getattr(mod, func)

View File

@ -1,8 +1,3 @@
import urllib
import boto
from oyster.conf import settings
class DummyStorage(object):
""" should NOT be used outside of testing """

View File

@ -2,8 +2,8 @@ from __future__ import absolute_import
import gridfs
class GridFSStorage(object):
class GridFSStorage(object):
storage_type = 'gridfs'
def __init__(self, kernel):

View File

@ -13,7 +13,7 @@ class S3Storage(object):
def put(self, tracked_doc, data, content_type):
""" upload the document to S3 """
k = boto.s3.key.Key(self.bucket)
key_name = getattr(settings, AWS_PREFIX, '') + tracked_doc['_id']
key_name = getattr(settings, 'AWS_PREFIX', '') + tracked_doc['_id']
k.key = key_name
headers = {'x-amz-acl': 'public-read',
'Content-Type': content_type}

View File

@ -1,8 +1,6 @@
from celery.task.base import Task, PeriodicTask
from celery.execute import send_task
from pymongo.objectid import ObjectId
from oyster.core import kernel
@ -15,7 +13,7 @@ class UpdateTask(Task):
doc = kernel.db.tracked.find_one({'_id': doc_id})
kernel.update(doc)
for task in doc.get('post_update_tasks', []):
send_task(hook, (doc_id,))
send_task(task, (doc_id,))
kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
# don't sit on a connection
kernel.db.connection.end_request()

View File

@ -3,14 +3,16 @@ import datetime
from unittest import TestCase
from nose.tools import assert_raises
import pymongo
from oyster.core import Kernel
def hook_fired(doc, newdata):
doc['hook_fired'] = doc.get('hook_fired', 0) + 1
RANDOM_URL = 'http://www.random.org/integers/?num=1&min=-1000000000&max=1000000000&col=1&base=10&format=plain&rnd=new'
RANDOM_URL = ('http://www.random.org/integers/?num=1&min=-1000000000&'
'max=1000000000&col=1&base=10&format=plain&rnd=new')
class KernelTests(TestCase):
@ -20,7 +22,7 @@ class KernelTests(TestCase):
'onchanged': []
},
'fast-update':
{'update_mins': 1/60., 'storage_engine': 'dummy',
{'update_mins': 1 / 60., 'storage_engine': 'dummy',
'onchanged': []
},
'one-time':
@ -32,7 +34,8 @@ class KernelTests(TestCase):
'onchanged': [hook_fired]
}
}
self.kernel = Kernel(mongo_db='oyster_test', retry_wait_minutes=1/60.,
self.kernel = Kernel(mongo_db='oyster_test',
retry_wait_minutes=1 / 60.,
doc_classes=doc_classes)
self.kernel._wipe()
@ -53,7 +56,6 @@ class KernelTests(TestCase):
# ensure that a bad document class raises an error
assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}})
def test_log(self):
self.kernel.log('action1', 'http://example.com')
self.kernel.log('action2', 'http://test.com', error=True, pi=3)
@ -63,7 +65,6 @@ class KernelTests(TestCase):
assert x['url'] == 'http://test.com'
assert x['pi'] == 3
def test_track_url(self):
# basic insert
id1 = self.kernel.track_url('http://example.com', 'default', pi=3)
@ -105,7 +106,6 @@ class KernelTests(TestCase):
# logged error
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
def test_no_update(self):
# update
self.kernel.track_url('http://example.com', 'one-time')
@ -122,7 +122,6 @@ class KernelTests(TestCase):
assert not self.kernel.md5_versioning('hello!', 'hello!')
assert self.kernel.md5_versioning('hello!', 'hey!')
def test_update(self):
# get a single document tracked and call update on it
self.kernel.track_url('http://example.com', 'default')
@ -154,7 +153,6 @@ class KernelTests(TestCase):
# check that logs updated
assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
def test_update_failure(self):
# track a non-existent URL
self.kernel.track_url('http://not_a_url', 'default')
@ -174,7 +172,6 @@ class KernelTests(TestCase):
obj = self.kernel.db.tracked.find_one()
assert obj['consecutive_errors'] == 2
def test_update_onchanged_fire_only_on_change(self):
self.kernel.track_url('http://example.com', 'change-hook')
obj = self.kernel.db.tracked.find_one()
@ -201,7 +198,6 @@ class KernelTests(TestCase):
doc = self.kernel.db.tracked.find_one()
assert doc['hook_fired'] == 2
def test_get_update_queue(self):
self.kernel.track_url('never-updates', 'fast-update')
self.kernel.track_url('bad-uri', 'fast-update')
@ -229,15 +225,12 @@ class KernelTests(TestCase):
queue = self.kernel.get_update_queue()
assert len(queue) == 3
def test_get_update_queue_size(self):
self.kernel.track_url('a', 'fast-update')
self.kernel.track_url('b', 'fast-update')
self.kernel.track_url('c', 'fast-update')
a = self.kernel.db.tracked.find_one(dict(url='a'))
b = self.kernel.db.tracked.find_one(dict(url='b'))
c = self.kernel.db.tracked.find_one(dict(url='c'))
# size should start at 3
assert self.kernel.get_update_queue_size() == 3

View File

@ -3,6 +3,7 @@ from oyster.storage.s3 import S3Storage
from oyster.storage.gridfs import GridFSStorage
from oyster.storage.dummy import DummyStorage
def _simple_storage_test(StorageCls):
kernel = Kernel(mongo_db='oyster_test')
storage = StorageCls(kernel)

View File

@ -44,6 +44,7 @@ def api_wrapper(template=None):
app = flask.Flask('oyster')
@app.route('/')
@api_wrapper('index.html')
def index():