flake8 fixes: mostly whitespace, a few real bugs
This commit is contained in:
parent
45a93fcc68
commit
981e2cc88f
@ -1,5 +1,6 @@
|
|||||||
from oyster.conf import default_settings
|
from oyster.conf import default_settings
|
||||||
|
|
||||||
|
|
||||||
class Settings(object):
|
class Settings(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
@ -2,13 +2,13 @@ import datetime
|
|||||||
import hashlib
|
import hashlib
|
||||||
import random
|
import random
|
||||||
import sys
|
import sys
|
||||||
import urllib
|
|
||||||
|
|
||||||
import pymongo
|
import pymongo
|
||||||
import scrapelib
|
import scrapelib
|
||||||
|
|
||||||
from .storage import engines
|
from .storage import engines
|
||||||
|
|
||||||
|
|
||||||
class Kernel(object):
|
class Kernel(object):
|
||||||
""" oyster's workhorse, handles tracking """
|
""" oyster's workhorse, handles tracking """
|
||||||
|
|
||||||
@ -62,14 +62,12 @@ class Kernel(object):
|
|||||||
raise ValueError('doc_class %s missing key %s' %
|
raise ValueError('doc_class %s missing key %s' %
|
||||||
(dc_name, key))
|
(dc_name, key))
|
||||||
|
|
||||||
|
|
||||||
def _wipe(self):
|
def _wipe(self):
|
||||||
""" exists primarily for debug use, wipes entire db """
|
""" exists primarily for debug use, wipes entire db """
|
||||||
self.db.drop_collection('tracked')
|
self.db.drop_collection('tracked')
|
||||||
self.db.drop_collection('logs')
|
self.db.drop_collection('logs')
|
||||||
self.db.drop_collection('status')
|
self.db.drop_collection('status')
|
||||||
|
|
||||||
|
|
||||||
def log(self, action, url, error=False, **kwargs):
|
def log(self, action, url, error=False, **kwargs):
|
||||||
""" add an entry to the oyster log """
|
""" add an entry to the oyster log """
|
||||||
kwargs['action'] = action
|
kwargs['action'] = action
|
||||||
@ -78,11 +76,9 @@ class Kernel(object):
|
|||||||
kwargs['timestamp'] = datetime.datetime.utcnow()
|
kwargs['timestamp'] = datetime.datetime.utcnow()
|
||||||
self.db.logs.insert(kwargs)
|
self.db.logs.insert(kwargs)
|
||||||
|
|
||||||
|
|
||||||
def _add_doc_class(self, doc_class, **properties):
|
def _add_doc_class(self, doc_class, **properties):
|
||||||
self.doc_classes[doc_class] = properties
|
self.doc_classes[doc_class] = properties
|
||||||
|
|
||||||
|
|
||||||
def track_url(self, url, doc_class, id=None, **kwargs):
|
def track_url(self, url, doc_class, id=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Add a URL to the set of tracked URLs, accessible via a given filename.
|
Add a URL to the set of tracked URLs, accessible via a given filename.
|
||||||
@ -123,14 +119,12 @@ class Kernel(object):
|
|||||||
newdoc['_id'] = id
|
newdoc['_id'] = id
|
||||||
return self.db.tracked.insert(newdoc)
|
return self.db.tracked.insert(newdoc)
|
||||||
|
|
||||||
|
|
||||||
def md5_versioning(self, olddata, newdata):
|
def md5_versioning(self, olddata, newdata):
|
||||||
""" return True if md5 changed or if file is new """
|
""" return True if md5 changed or if file is new """
|
||||||
old_md5 = hashlib.md5(olddata).hexdigest()
|
old_md5 = hashlib.md5(olddata).hexdigest()
|
||||||
new_md5 = hashlib.md5(newdata).hexdigest()
|
new_md5 = hashlib.md5(newdata).hexdigest()
|
||||||
return old_md5 != new_md5
|
return old_md5 != new_md5
|
||||||
|
|
||||||
|
|
||||||
def update(self, doc):
|
def update(self, doc):
|
||||||
"""
|
"""
|
||||||
perform update upon a given document
|
perform update upon a given document
|
||||||
@ -203,7 +197,6 @@ class Kernel(object):
|
|||||||
|
|
||||||
self.db.tracked.save(doc, safe=True)
|
self.db.tracked.save(doc, safe=True)
|
||||||
|
|
||||||
|
|
||||||
def get_update_queue(self):
|
def get_update_queue(self):
|
||||||
"""
|
"""
|
||||||
Get a list of what needs to be updated.
|
Get a list of what needs to be updated.
|
||||||
@ -229,7 +222,6 @@ class Kernel(object):
|
|||||||
|
|
||||||
return queue
|
return queue
|
||||||
|
|
||||||
|
|
||||||
def get_update_queue_size(self):
|
def get_update_queue_size(self):
|
||||||
"""
|
"""
|
||||||
Get the size of the update queue, this should match
|
Get the size of the update queue, this should match
|
||||||
@ -251,7 +243,6 @@ class Kernel(object):
|
|||||||
return storage.get(doc['versions'][-1]['storage_key'])
|
return storage.get(doc['versions'][-1]['storage_key'])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _get_configured_kernel():
|
def _get_configured_kernel():
|
||||||
""" factory, gets a connection configured with oyster.conf.settings """
|
""" factory, gets a connection configured with oyster.conf.settings """
|
||||||
from oyster.conf import settings
|
from oyster.conf import settings
|
||||||
|
@ -3,6 +3,7 @@ import argparse
|
|||||||
|
|
||||||
from oyster.core import kernel
|
from oyster.core import kernel
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description='do a task for all documents in a doc_class',
|
description='do a task for all documents in a doc_class',
|
||||||
|
@ -1,8 +1,3 @@
|
|||||||
import urllib
|
|
||||||
import boto
|
|
||||||
from oyster.conf import settings
|
|
||||||
|
|
||||||
|
|
||||||
class DummyStorage(object):
|
class DummyStorage(object):
|
||||||
""" should NOT be used outside of testing """
|
""" should NOT be used outside of testing """
|
||||||
|
|
||||||
|
@ -2,8 +2,8 @@ from __future__ import absolute_import
|
|||||||
|
|
||||||
import gridfs
|
import gridfs
|
||||||
|
|
||||||
class GridFSStorage(object):
|
|
||||||
|
|
||||||
|
class GridFSStorage(object):
|
||||||
storage_type = 'gridfs'
|
storage_type = 'gridfs'
|
||||||
|
|
||||||
def __init__(self, kernel):
|
def __init__(self, kernel):
|
||||||
|
@ -13,7 +13,7 @@ class S3Storage(object):
|
|||||||
def put(self, tracked_doc, data, content_type):
|
def put(self, tracked_doc, data, content_type):
|
||||||
""" upload the document to S3 """
|
""" upload the document to S3 """
|
||||||
k = boto.s3.key.Key(self.bucket)
|
k = boto.s3.key.Key(self.bucket)
|
||||||
key_name = getattr(settings, AWS_PREFIX, '') + tracked_doc['_id']
|
key_name = getattr(settings, 'AWS_PREFIX', '') + tracked_doc['_id']
|
||||||
k.key = key_name
|
k.key = key_name
|
||||||
headers = {'x-amz-acl': 'public-read',
|
headers = {'x-amz-acl': 'public-read',
|
||||||
'Content-Type': content_type}
|
'Content-Type': content_type}
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
from celery.task.base import Task, PeriodicTask
|
from celery.task.base import Task, PeriodicTask
|
||||||
from celery.execute import send_task
|
from celery.execute import send_task
|
||||||
|
|
||||||
from pymongo.objectid import ObjectId
|
|
||||||
|
|
||||||
from oyster.core import kernel
|
from oyster.core import kernel
|
||||||
|
|
||||||
|
|
||||||
@ -15,7 +13,7 @@ class UpdateTask(Task):
|
|||||||
doc = kernel.db.tracked.find_one({'_id': doc_id})
|
doc = kernel.db.tracked.find_one({'_id': doc_id})
|
||||||
kernel.update(doc)
|
kernel.update(doc)
|
||||||
for task in doc.get('post_update_tasks', []):
|
for task in doc.get('post_update_tasks', []):
|
||||||
send_task(hook, (doc_id,))
|
send_task(task, (doc_id,))
|
||||||
kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
|
kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
|
||||||
# don't sit on a connection
|
# don't sit on a connection
|
||||||
kernel.db.connection.end_request()
|
kernel.db.connection.end_request()
|
||||||
|
@ -3,14 +3,16 @@ import datetime
|
|||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
|
|
||||||
from nose.tools import assert_raises
|
from nose.tools import assert_raises
|
||||||
import pymongo
|
|
||||||
|
|
||||||
from oyster.core import Kernel
|
from oyster.core import Kernel
|
||||||
|
|
||||||
|
|
||||||
def hook_fired(doc, newdata):
|
def hook_fired(doc, newdata):
|
||||||
doc['hook_fired'] = doc.get('hook_fired', 0) + 1
|
doc['hook_fired'] = doc.get('hook_fired', 0) + 1
|
||||||
|
|
||||||
RANDOM_URL = 'http://www.random.org/integers/?num=1&min=-1000000000&max=1000000000&col=1&base=10&format=plain&rnd=new'
|
RANDOM_URL = ('http://www.random.org/integers/?num=1&min=-1000000000&'
|
||||||
|
'max=1000000000&col=1&base=10&format=plain&rnd=new')
|
||||||
|
|
||||||
|
|
||||||
class KernelTests(TestCase):
|
class KernelTests(TestCase):
|
||||||
|
|
||||||
@ -32,7 +34,8 @@ class KernelTests(TestCase):
|
|||||||
'onchanged': [hook_fired]
|
'onchanged': [hook_fired]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.kernel = Kernel(mongo_db='oyster_test', retry_wait_minutes=1/60.,
|
self.kernel = Kernel(mongo_db='oyster_test',
|
||||||
|
retry_wait_minutes=1 / 60.,
|
||||||
doc_classes=doc_classes)
|
doc_classes=doc_classes)
|
||||||
self.kernel._wipe()
|
self.kernel._wipe()
|
||||||
|
|
||||||
@ -53,7 +56,6 @@ class KernelTests(TestCase):
|
|||||||
# ensure that a bad document class raises an error
|
# ensure that a bad document class raises an error
|
||||||
assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}})
|
assert_raises(ValueError, Kernel, doc_classes={'bad-doc': {}})
|
||||||
|
|
||||||
|
|
||||||
def test_log(self):
|
def test_log(self):
|
||||||
self.kernel.log('action1', 'http://example.com')
|
self.kernel.log('action1', 'http://example.com')
|
||||||
self.kernel.log('action2', 'http://test.com', error=True, pi=3)
|
self.kernel.log('action2', 'http://test.com', error=True, pi=3)
|
||||||
@ -63,7 +65,6 @@ class KernelTests(TestCase):
|
|||||||
assert x['url'] == 'http://test.com'
|
assert x['url'] == 'http://test.com'
|
||||||
assert x['pi'] == 3
|
assert x['pi'] == 3
|
||||||
|
|
||||||
|
|
||||||
def test_track_url(self):
|
def test_track_url(self):
|
||||||
# basic insert
|
# basic insert
|
||||||
id1 = self.kernel.track_url('http://example.com', 'default', pi=3)
|
id1 = self.kernel.track_url('http://example.com', 'default', pi=3)
|
||||||
@ -105,7 +106,6 @@ class KernelTests(TestCase):
|
|||||||
# logged error
|
# logged error
|
||||||
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
|
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
|
||||||
|
|
||||||
|
|
||||||
def test_no_update(self):
|
def test_no_update(self):
|
||||||
# update
|
# update
|
||||||
self.kernel.track_url('http://example.com', 'one-time')
|
self.kernel.track_url('http://example.com', 'one-time')
|
||||||
@ -122,7 +122,6 @@ class KernelTests(TestCase):
|
|||||||
assert not self.kernel.md5_versioning('hello!', 'hello!')
|
assert not self.kernel.md5_versioning('hello!', 'hello!')
|
||||||
assert self.kernel.md5_versioning('hello!', 'hey!')
|
assert self.kernel.md5_versioning('hello!', 'hey!')
|
||||||
|
|
||||||
|
|
||||||
def test_update(self):
|
def test_update(self):
|
||||||
# get a single document tracked and call update on it
|
# get a single document tracked and call update on it
|
||||||
self.kernel.track_url('http://example.com', 'default')
|
self.kernel.track_url('http://example.com', 'default')
|
||||||
@ -154,7 +153,6 @@ class KernelTests(TestCase):
|
|||||||
# check that logs updated
|
# check that logs updated
|
||||||
assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
|
assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
|
||||||
|
|
||||||
|
|
||||||
def test_update_failure(self):
|
def test_update_failure(self):
|
||||||
# track a non-existent URL
|
# track a non-existent URL
|
||||||
self.kernel.track_url('http://not_a_url', 'default')
|
self.kernel.track_url('http://not_a_url', 'default')
|
||||||
@ -174,7 +172,6 @@ class KernelTests(TestCase):
|
|||||||
obj = self.kernel.db.tracked.find_one()
|
obj = self.kernel.db.tracked.find_one()
|
||||||
assert obj['consecutive_errors'] == 2
|
assert obj['consecutive_errors'] == 2
|
||||||
|
|
||||||
|
|
||||||
def test_update_onchanged_fire_only_on_change(self):
|
def test_update_onchanged_fire_only_on_change(self):
|
||||||
self.kernel.track_url('http://example.com', 'change-hook')
|
self.kernel.track_url('http://example.com', 'change-hook')
|
||||||
obj = self.kernel.db.tracked.find_one()
|
obj = self.kernel.db.tracked.find_one()
|
||||||
@ -201,7 +198,6 @@ class KernelTests(TestCase):
|
|||||||
doc = self.kernel.db.tracked.find_one()
|
doc = self.kernel.db.tracked.find_one()
|
||||||
assert doc['hook_fired'] == 2
|
assert doc['hook_fired'] == 2
|
||||||
|
|
||||||
|
|
||||||
def test_get_update_queue(self):
|
def test_get_update_queue(self):
|
||||||
self.kernel.track_url('never-updates', 'fast-update')
|
self.kernel.track_url('never-updates', 'fast-update')
|
||||||
self.kernel.track_url('bad-uri', 'fast-update')
|
self.kernel.track_url('bad-uri', 'fast-update')
|
||||||
@ -229,15 +225,12 @@ class KernelTests(TestCase):
|
|||||||
queue = self.kernel.get_update_queue()
|
queue = self.kernel.get_update_queue()
|
||||||
assert len(queue) == 3
|
assert len(queue) == 3
|
||||||
|
|
||||||
|
|
||||||
def test_get_update_queue_size(self):
|
def test_get_update_queue_size(self):
|
||||||
self.kernel.track_url('a', 'fast-update')
|
self.kernel.track_url('a', 'fast-update')
|
||||||
self.kernel.track_url('b', 'fast-update')
|
self.kernel.track_url('b', 'fast-update')
|
||||||
self.kernel.track_url('c', 'fast-update')
|
self.kernel.track_url('c', 'fast-update')
|
||||||
|
|
||||||
a = self.kernel.db.tracked.find_one(dict(url='a'))
|
a = self.kernel.db.tracked.find_one(dict(url='a'))
|
||||||
b = self.kernel.db.tracked.find_one(dict(url='b'))
|
|
||||||
c = self.kernel.db.tracked.find_one(dict(url='c'))
|
|
||||||
|
|
||||||
# size should start at 3
|
# size should start at 3
|
||||||
assert self.kernel.get_update_queue_size() == 3
|
assert self.kernel.get_update_queue_size() == 3
|
||||||
|
@ -3,6 +3,7 @@ from oyster.storage.s3 import S3Storage
|
|||||||
from oyster.storage.gridfs import GridFSStorage
|
from oyster.storage.gridfs import GridFSStorage
|
||||||
from oyster.storage.dummy import DummyStorage
|
from oyster.storage.dummy import DummyStorage
|
||||||
|
|
||||||
|
|
||||||
def _simple_storage_test(StorageCls):
|
def _simple_storage_test(StorageCls):
|
||||||
kernel = Kernel(mongo_db='oyster_test')
|
kernel = Kernel(mongo_db='oyster_test')
|
||||||
storage = StorageCls(kernel)
|
storage = StorageCls(kernel)
|
||||||
|
@ -44,6 +44,7 @@ def api_wrapper(template=None):
|
|||||||
|
|
||||||
app = flask.Flask('oyster')
|
app = flask.Flask('oyster')
|
||||||
|
|
||||||
|
|
||||||
@app.route('/')
|
@app.route('/')
|
||||||
@api_wrapper('index.html')
|
@api_wrapper('index.html')
|
||||||
def index():
|
def index():
|
||||||
|
Loading…
Reference in New Issue
Block a user