flake8 fixes: mostly whitespace, a few real bugs
This commit is contained in:
parent
45a93fcc68
commit
981e2cc88f
@ -1,5 +1,6 @@
|
||||
from oyster.conf import default_settings
|
||||
|
||||
|
||||
class Settings(object):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
@ -2,13 +2,13 @@ import datetime
|
||||
import hashlib
|
||||
import random
|
||||
import sys
|
||||
import urllib
|
||||
|
||||
import pymongo
|
||||
import scrapelib
|
||||
|
||||
from .storage import engines
|
||||
|
||||
|
||||
class Kernel(object):
|
||||
""" oyster's workhorse, handles tracking """
|
||||
|
||||
@ -62,14 +62,12 @@ class Kernel(object):
|
||||
raise ValueError('doc_class %s missing key %s' %
|
||||
(dc_name, key))
|
||||
|
||||
|
||||
def _wipe(self):
|
||||
""" exists primarily for debug use, wipes entire db """
|
||||
self.db.drop_collection('tracked')
|
||||
self.db.drop_collection('logs')
|
||||
self.db.drop_collection('status')
|
||||
|
||||
|
||||
def log(self, action, url, error=False, **kwargs):
|
||||
""" add an entry to the oyster log """
|
||||
kwargs['action'] = action
|
||||
@ -78,11 +76,9 @@ class Kernel(object):
|
||||
kwargs['timestamp'] = datetime.datetime.utcnow()
|
||||
self.db.logs.insert(kwargs)
|
||||
|
||||
|
||||
def _add_doc_class(self, doc_class, **properties):
|
||||
self.doc_classes[doc_class] = properties
|
||||
|
||||
|
||||
def track_url(self, url, doc_class, id=None, **kwargs):
|
||||
"""
|
||||
Add a URL to the set of tracked URLs, accessible via a given filename.
|
||||
@ -123,14 +119,12 @@ class Kernel(object):
|
||||
newdoc['_id'] = id
|
||||
return self.db.tracked.insert(newdoc)
|
||||
|
||||
|
||||
def md5_versioning(self, olddata, newdata):
|
||||
""" return True if md5 changed or if file is new """
|
||||
old_md5 = hashlib.md5(olddata).hexdigest()
|
||||
new_md5 = hashlib.md5(newdata).hexdigest()
|
||||
return old_md5 != new_md5
|
||||
|
||||
|
||||
def update(self, doc):
|
||||
"""
|
||||
perform update upon a given document
|
||||
@ -187,7 +181,7 @@ class Kernel(object):
|
||||
c_errors = doc.get('consecutive_errors', 0)
|
||||
doc['consecutive_errors'] = c_errors + 1
|
||||
if c_errors <= self.retry_attempts:
|
||||
update_mins = self.retry_wait_minutes * (2**c_errors)
|
||||
update_mins = self.retry_wait_minutes * (2 ** c_errors)
|
||||
else:
|
||||
# reset error count if all was ok
|
||||
doc['consecutive_errors'] = 0
|
||||
@ -203,7 +197,6 @@ class Kernel(object):
|
||||
|
||||
self.db.tracked.save(doc, safe=True)
|
||||
|
||||
|
||||
def get_update_queue(self):
|
||||
"""
|
||||
Get a list of what needs to be updated.
|
||||
@ -229,7 +222,6 @@ class Kernel(object):
|
||||
|
||||
return queue
|
||||
|
||||
|
||||
def get_update_queue_size(self):
|
||||
"""
|
||||
Get the size of the update queue, this should match
|
||||
@ -240,7 +232,7 @@ class Kernel(object):
|
||||
{'next_update': {'$ne': None}},
|
||||
{'next_update': {'$lt': datetime.datetime.utcnow()}},
|
||||
]}).count()
|
||||
return new+next
|
||||
return new + next
|
||||
|
||||
def get_last_version(self, doc):
|
||||
try:
|
||||
@ -251,7 +243,6 @@ class Kernel(object):
|
||||
return storage.get(doc['versions'][-1]['storage_key'])
|
||||
|
||||
|
||||
|
||||
def _get_configured_kernel():
|
||||
""" factory, gets a connection configured with oyster.conf.settings """
|
||||
from oyster.conf import settings
|
||||
|
@ -3,6 +3,7 @@ import argparse
|
||||
|
||||
from oyster.core import kernel
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description='do a task for all documents in a doc_class',
|
||||
@ -14,10 +15,10 @@ def main():
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
docs = kernel.db.tracked.find({'doc_class':args.doc_class})
|
||||
docs = kernel.db.tracked.find({'doc_class': args.doc_class})
|
||||
print '%s docs in %s' % (docs.count(), args.doc_class)
|
||||
|
||||
path, func = args.function.rsplit('.',1)
|
||||
path, func = args.function.rsplit('.', 1)
|
||||
mod = __import__(path, fromlist=[func])
|
||||
func = getattr(mod, func)
|
||||
|
||||
|
@ -1,8 +1,3 @@
|
||||
import urllib
|
||||
import boto
|
||||
from oyster.conf import settings
|
||||
|
||||
|
||||
class DummyStorage(object):
|
||||
""" should NOT be used outside of testing """
|
||||
|
||||
|
@ -2,8 +2,8 @@ from __future__ import absolute_import
|
||||
|
||||
import gridfs
|
||||
|
||||
class GridFSStorage(object):
|
||||
|
||||
class GridFSStorage(object):
|
||||
storage_type = 'gridfs'
|
||||
|
||||
def __init__(self, kernel):
|
||||
|
@ -13,7 +13,7 @@ class S3Storage(object):
|
||||
def put(self, tracked_doc, data, content_type):
|
||||
""" upload the document to S3 """
|
||||
k = boto.s3.key.Key(self.bucket)
|
||||
key_name = getattr(settings, AWS_PREFIX, '') + tracked_doc['_id']
|
||||
key_name = getattr(settings, 'AWS_PREFIX', '') + tracked_doc['_id']
|
||||
k.key = key_name
|
||||
headers = {'x-amz-acl': 'public-read',
|
||||
'Content-Type': content_type}
|
||||
|
@ -1,8 +1,6 @@
|
||||
from celery.task.base import Task, PeriodicTask
|
||||
from celery.execute import send_task
|
||||
|
||||
from pymongo.objectid import ObjectId
|
||||
|
||||
from oyster.core import kernel
|
||||
|
||||
|
||||
@ -15,7 +13,7 @@ class UpdateTask(Task):
|
||||
doc = kernel.db.tracked.find_one({'_id': doc_id})
|
||||
kernel.update(doc)
|
||||
for task in doc.get('post_update_tasks', []):
|
||||
send_task(hook, (doc_id,))
|
||||
send_task(task, (doc_id,))
|
||||
kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
|
||||
# don't sit on a connection
|
||||
kernel.db.connection.end_request()
|
||||
|
@ -3,14 +3,16 @@ import datetime
|
||||
from unittest import TestCase
|
||||
|
||||
from nose.tools import assert_raises
|
||||
import pymongo
|
||||
|
||||
from oyster.core import Kernel
|
||||
|
||||
|
||||
def hook_fired(doc, newdata):
|
||||
doc['hook_fired'] = doc.get('hook_fired', 0) + 1
|
||||
|
||||
RANDOM_URL = 'http://www.random.org/integers/?num=1&min=-1000000000&max=1000000000&col=1&base=10&format=plain&rnd=new'
|
||||
RANDOM_URL = ('http://www.random.org/integers/?num=1&min=-1000000000&'
|
||||
'max=1000000000&col=1&base=10&format=plain&rnd=new')
|
||||
|
||||
|
||||
class KernelTests(TestCase):
|
||||
|
||||
@ -20,7 +22,7 @@ class KernelTests(TestCase):
|
||||
'onchanged': []
|
||||
},
|
||||
'fast-update':
|
||||
{'update_mins': 1/60., 'storage_engine': 'dummy',
|
||||
{'update_mins': 1 / 60., 'storage_engine': 'dummy',
|
||||
'onchanged': []
|
||||
},
|
||||
'one-time':
|
||||
@ -32,7 +34,8 @@ class KernelTests(TestCase):
|
||||
'onchanged': [hook_fired]
|
||||
}
|
||||
}
|
||||
self.kernel = Kernel(mongo_db='oyster_test', retry_wait_minutes=1/60.,
|
||||
self.kernel = Kernel(mongo_db='oyster_test',
|
||||
retry_wait_minutes=1 / 60.,
|
||||
doc_classes=doc_classes)
|
||||
self.kernel._wipe()
|
||||
|
||||
@ -53,7 +56,6 @@ class KernelTests(TestCase):
|
||||
# ensure that a bad document class raises an error
|
||||
assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}})
|
||||
|
||||
|
||||
def test_log(self):
|
||||
self.kernel.log('action1', 'http://example.com')
|
||||
self.kernel.log('action2', 'http://test.com', error=True, pi=3)
|
||||
@ -63,7 +65,6 @@ class KernelTests(TestCase):
|
||||
assert x['url'] == 'http://test.com'
|
||||
assert x['pi'] == 3
|
||||
|
||||
|
||||
def test_track_url(self):
|
||||
# basic insert
|
||||
id1 = self.kernel.track_url('http://example.com', 'default', pi=3)
|
||||
@ -105,7 +106,6 @@ class KernelTests(TestCase):
|
||||
# logged error
|
||||
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
|
||||
|
||||
|
||||
def test_no_update(self):
|
||||
# update
|
||||
self.kernel.track_url('http://example.com', 'one-time')
|
||||
@ -122,7 +122,6 @@ class KernelTests(TestCase):
|
||||
assert not self.kernel.md5_versioning('hello!', 'hello!')
|
||||
assert self.kernel.md5_versioning('hello!', 'hey!')
|
||||
|
||||
|
||||
def test_update(self):
|
||||
# get a single document tracked and call update on it
|
||||
self.kernel.track_url('http://example.com', 'default')
|
||||
@ -154,7 +153,6 @@ class KernelTests(TestCase):
|
||||
# check that logs updated
|
||||
assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
|
||||
|
||||
|
||||
def test_update_failure(self):
|
||||
# track a non-existent URL
|
||||
self.kernel.track_url('http://not_a_url', 'default')
|
||||
@ -174,7 +172,6 @@ class KernelTests(TestCase):
|
||||
obj = self.kernel.db.tracked.find_one()
|
||||
assert obj['consecutive_errors'] == 2
|
||||
|
||||
|
||||
def test_update_onchanged_fire_only_on_change(self):
|
||||
self.kernel.track_url('http://example.com', 'change-hook')
|
||||
obj = self.kernel.db.tracked.find_one()
|
||||
@ -201,7 +198,6 @@ class KernelTests(TestCase):
|
||||
doc = self.kernel.db.tracked.find_one()
|
||||
assert doc['hook_fired'] == 2
|
||||
|
||||
|
||||
def test_get_update_queue(self):
|
||||
self.kernel.track_url('never-updates', 'fast-update')
|
||||
self.kernel.track_url('bad-uri', 'fast-update')
|
||||
@ -229,15 +225,12 @@ class KernelTests(TestCase):
|
||||
queue = self.kernel.get_update_queue()
|
||||
assert len(queue) == 3
|
||||
|
||||
|
||||
def test_get_update_queue_size(self):
|
||||
self.kernel.track_url('a', 'fast-update')
|
||||
self.kernel.track_url('b', 'fast-update')
|
||||
self.kernel.track_url('c', 'fast-update')
|
||||
|
||||
a = self.kernel.db.tracked.find_one(dict(url='a'))
|
||||
b = self.kernel.db.tracked.find_one(dict(url='b'))
|
||||
c = self.kernel.db.tracked.find_one(dict(url='c'))
|
||||
|
||||
# size should start at 3
|
||||
assert self.kernel.get_update_queue_size() == 3
|
||||
|
@ -3,6 +3,7 @@ from oyster.storage.s3 import S3Storage
|
||||
from oyster.storage.gridfs import GridFSStorage
|
||||
from oyster.storage.dummy import DummyStorage
|
||||
|
||||
|
||||
def _simple_storage_test(StorageCls):
|
||||
kernel = Kernel(mongo_db='oyster_test')
|
||||
storage = StorageCls(kernel)
|
||||
|
@ -44,6 +44,7 @@ def api_wrapper(template=None):
|
||||
|
||||
app = flask.Flask('oyster')
|
||||
|
||||
|
||||
@app.route('/')
|
||||
@api_wrapper('index.html')
|
||||
def index():
|
||||
|
Loading…
Reference in New Issue
Block a user