switch to Kernel

commit e73ec6ab6b (parent f6a85c119b)
@@ -9,27 +9,16 @@ import gridfs
 import scrapelib
 
 
-def get_configured_connection():
-    """ factory, gets a connection configured with oyster.conf.settings """
-    from oyster.conf import settings
-    return Connection(mongo_host=settings.MONGO_HOST,
-                      mongo_port=settings.MONGO_PORT,
-                      mongo_db=settings.MONGO_DATABASE,
-                      mongo_log_maxsize=settings.MONGO_LOG_MAXSIZE,
-                      user_agent=settings.USER_AGENT,
-                      rpm=settings.REQUESTS_PER_MINUTE,
-                      timeout=settings.REQUEST_TIMEOUT,
-                      retry_attempts=settings.RETRY_ATTEMPTS,
-                      retry_wait_minutes=settings.RETRY_WAIT_MINUTES)
-
-
-class Connection(object):
+class Kernel(object):
     """ oyster's workhorse, handles tracking """
 
     def __init__(self, mongo_host='localhost', mongo_port=27017,
                  mongo_db='oyster', mongo_log_maxsize=100000000,
                  user_agent='oyster', rpm=60, timeout=300,
                  retry_attempts=3, retry_wait_minutes=60):
+        """
+        configurable for ease of testing, only one should be instantiated
+        """
 
         # set up a capped log if it doesn't exist
         self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
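
Note: direct construction is now mainly for tests (the settings-based factory reappears below as _get_configured_kernel). A minimal sketch of standing up a Kernel against a throwaway database, mirroring the test suite's setUp; it assumes a local mongod and the oyster.core module this commit introduces:

    # minimal sketch, assuming mongod is running locally and
    # oyster.core is importable
    from oyster.core import Kernel

    k = Kernel(mongo_db='oyster_test', retry_wait_minutes=1/60.)
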
@@ -83,6 +72,12 @@ class Connection(object):
 
         url
             URL to start tracking
+        versioning
+            currently only valid value is "md5"
+        update_mins
+            minutes between automatic updates, default is 1440 (1 day)
+        **kwargs
+            any keyword args will be added to the document's metadata
         """
         tracked = self.db.tracked.find_one({'url': url})
 
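
Note: the expanded docstring maps onto a call like the sketch below, using the module-level kernel introduced later in this commit; 'category' is an invented example kwarg showing the metadata passthrough:

    # hedged sketch of track_url per the docstring above
    doc_id = kernel.track_url('http://example.com/feed',
                              versioning='md5',   # only supported value
                              update_mins=60,     # re-fetch hourly
                              category='feeds')   # lands in metadata['category']
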
@@ -116,6 +111,18 @@ class Connection(object):
 
 
     def update(self, doc):
+        """
+        perform update upon a given document
+
+        :param:`doc` must be a document from the `tracked` collection
+
+        * download latest document
+        * check if document has changed using versioning func
+        * if a change has occurred save the file to GridFS
+        * if error occurred, log & keep track of how many errors in a row
+        * update last_update/next_update timestamp
+        """
         do_put = True
         error = False
 
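
Note: the "versioning func" in step two is the md5 check the tests exercise as md5_versioning(doc, newdata); its body is not shown in this diff, so the following is only a plausible sketch:

    import hashlib
    from gridfs.errors import NoFile

    def md5_versioning(self, doc, newdata):
        # sketch: True means "content changed, store a new version"
        try:
            old_md5 = self.fs.get_last_version(filename=doc['url']).md5
        except NoFile:
            return True  # nothing stored yet
        return hashlib.md5(newdata).hexdigest() != old_md5
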
@@ -166,6 +173,9 @@ class Connection(object):
 
 
     def get_all_versions(self, url):
+        """
+        get all versions stored for a given URL
+        """
         versions = []
         n = 0
         while True:
@@ -178,10 +188,23 @@ class Connection(object):
 
 
     def get_version(self, url, n=-1):
+        """
+        get a specific version of a file
+
+        defaults to getting latest version
+        """
         return self.fs.get_version(url, n)
 
 
     def get_update_queue(self):
+        """
+        Get a list of what needs to be updated.
+
+        Documents that have never been updated take priority, followed by
+        documents that are simply stale. Within these two categories results
+        are sorted in semirandom order to decrease odds of piling on one
+        server.
+        """
         # results are always sorted by random to avoid piling on single server
 
         # first we try to update anything that we've never retrieved
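
Note: the two-category, semirandom ordering described in the docstring suggests a pair of queries over next_update, each sorted by the precomputed _random field the tests assert on. A sketch under those assumptions, not necessarily the exact body:

    def get_update_queue(self):
        # never-fetched documents first, then stale ones; _random spreads
        # requests so a single server isn't hit in sequence
        new = self.db.tracked.find({'next_update': {'$exists': False}})
        stale = self.db.tracked.find({'next_update':
                                      {'$lt': datetime.datetime.utcnow()}})
        return list(new.sort('_random')) + list(stale.sort('_random'))
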
@@ -198,7 +221,28 @@ class Connection(object):
 
 
     def get_update_queue_size(self):
+        """
+        Get the size of the update queue, this should match
+        ``len(self.get_update_queue())``, but is computed more efficiently.
+        """
         new = self.db.tracked.find({'next_update': {'$exists': False}}).count()
         next = self.db.tracked.find({'next_update':
                                      {'$lt': datetime.datetime.utcnow()}}).count()
         return new+next
+
+
+def _get_configured_kernel():
+    """ factory, gets a connection configured with oyster.conf.settings """
+    from oyster.conf import settings
+    return Kernel(mongo_host=settings.MONGO_HOST,
+                  mongo_port=settings.MONGO_PORT,
+                  mongo_db=settings.MONGO_DATABASE,
+                  mongo_log_maxsize=settings.MONGO_LOG_MAXSIZE,
+                  user_agent=settings.USER_AGENT,
+                  rpm=settings.REQUESTS_PER_MINUTE,
+                  timeout=settings.REQUEST_TIMEOUT,
+                  retry_attempts=settings.RETRY_ATTEMPTS,
+                  retry_wait_minutes=settings.RETRY_WAIT_MINUTES)
+
+
+kernel = _get_configured_kernel()
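
Note: the factory is now private and runs once at import, so every consumer shares one settings-configured instance. Illustrative only (the old module and function no longer exist after this commit):

    # before:
    # from oyster.connection import get_configured_connection
    # conn = get_configured_connection()
    # conn.track_url('http://example.com')

    # after:
    from oyster.core import kernel
    kernel.track_url('http://example.com')
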
@@ -4,7 +4,7 @@ from celery.execute import send_task
 from pymongo.objectid import ObjectId
 
 from oyster.conf import settings
-from oyster.connection import get_configured_connection
+from oyster.core import kernel
 
 
 class UpdateTask(Task):
@@ -12,17 +12,12 @@ class UpdateTask(Task):
     # results go straight to database
     ignore_result = True
 
-    def __init__(self):
-        # one connection per process
-        self.conn = get_configured_connection()
-
-
     def run(self, doc_id):
-        doc = self.conn.db.tracked.find_one({'_id': doc_id})
-        self.conn.update(doc)
+        doc = kernel.db.tracked.find_one({'_id': doc_id})
+        kernel.update(doc)
         for hook in doc.get('post_update_hooks', []):
             send_task(hook, (doc_id,))
-        self.conn.db.status.update({}, {'$inc': {'update_queue': -1}})
+        kernel.db.status.update({}, {'$inc': {'update_queue': -1}})
 
 
 class UpdateTaskScheduler(PeriodicTask):
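
Note: dropping __init__ works because the module-level kernel already gives each worker process a single shared instance. On the receiving end, a post-update hook is just a named celery task taking the doc id; a hypothetical example (the task name and body are invented):

    from celery.task import task

    from oyster.core import kernel

    @task()
    def example_hook(doc_id):
        # celery registers this under its module path, and that string is
        # what would be stored in a document's post_update_hooks list
        doc = kernel.db.tracked.find_one({'_id': doc_id})
        kernel.log('hook', doc['url'])
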
@@ -30,19 +25,18 @@ class UpdateTaskScheduler(PeriodicTask):
 
     # 60s tick
     run_every = 60
-    conn = get_configured_connection()
 
     def run(self):
         # if the update queue isn't empty, wait to add more
         # (currently the only way we avoid duplicates)
         # alternate option would be to set a _queued flag on documents
-        if self.conn.db.status.find_one()['update_queue']:
+        if kernel.db.status.find_one()['update_queue']:
             return
 
-        next_set = self.conn.get_update_queue()
+        next_set = kernel.get_update_queue()
         for doc in next_set:
             UpdateTask.delay(doc['_id'])
-            self.conn.db.status.update({}, {'$inc': {'update_queue': 1}})
+            kernel.db.status.update({}, {'$inc': {'update_queue': 1}})
 
 
 class ExternalStoreTask(Task):
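
Note: as the comments say, the update_queue counter in the status collection is the only duplicate-suppression guard. Its lifecycle, pieced together from this diff (the initial seeding of the status document is not shown here):

    kernel.db.status.update({}, {'$inc': {'update_queue': 1}})   # scheduler enqueues
    kernel.db.status.update({}, {'$inc': {'update_queue': -1}})  # UpdateTask finishes
    pending = kernel.db.status.find_one()['update_queue']        # scheduler's guard
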
@@ -60,21 +54,17 @@ class ExternalStoreTask(Task):
     # used as a base class
     abstract = True
 
-    def __init__(self):
-        # one connection per process
-        self.conn = get_configured_connection()
-
-
     def run(self, doc_id, extract_text=lambda x: x):
         # get the document
-        doc = self.conn.db.tracked.find_one({'_id': ObjectId(doc_id)})
-        filedata = self.conn.get_version(doc['url']).read()
+        doc = kernel.db.tracked.find_one({'_id': ObjectId(doc_id)})
+        filedata = kernel.get_version(doc['url']).read()
         text = extract_text(filedata, doc['metadata'])
 
         # put the document into the data store
         result = self.upload_document(doc_id, text, doc['metadata'])
 
         doc[self.external_store + '_id'] = result
-        self.conn.db.tracked.save(doc, safe=True)
+        kernel.db.tracked.save(doc, safe=True)
 
 
     def upload_document(self, doc_id, filedata, metadata):
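
Note: a concrete store still only overrides external_store and upload_document; run() handles fetching, text extraction, and bookkeeping. A hypothetical subclass, defined alongside the base class:

    class ExampleStoreTask(ExternalStoreTask):
        # invented illustration, not part of this commit
        external_store = 'example'

        def upload_document(self, doc_id, filedata, metadata):
            # push filedata to the external service and return its id;
            # run() saves it back onto the doc as doc['example_id']
            return 'example-id-for-%s' % doc_id
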
@@ -1,190 +0,0 @@
-import time
-import datetime
-from unittest import TestCase
-
-from nose.tools import assert_raises
-import pymongo
-
-from oyster.connection import Connection
-
-
-class ConnectionTests(TestCase):
-
-    def setUp(self):
-        self.conn = Connection(mongo_db='oyster_test', retry_wait_minutes=1/60.)
-        self.conn._wipe()
-
-
-    def test_constructor(self):
-        c = Connection('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000,
-                       user_agent='test-ua', rpm=30, timeout=60,
-                       retry_attempts=7, retry_wait_minutes=8)
-        assert c.db.connection.host == '127.0.0.1'
-        assert c.db.connection.port == 27017
-        assert c.db.logs.options()['capped'] == True
-        assert c.db.logs.options()['size'] == 5000
-        assert c.retry_wait_minutes == 8
-        # TODO: test retry_attempts
-        assert c.scraper.user_agent == 'test-ua'
-        assert c.scraper.requests_per_minute == 30
-        assert c.scraper.timeout == 60
-
-
-    def test_log(self):
-        self.conn.log('action1', 'http://example.com')
-        self.conn.log('action2', 'http://test.com', error=True, pi=3)
-        assert self.conn.db.logs.count() == 2
-        x = self.conn.db.logs.find_one({'error': True})
-        assert x['action'] == 'action2'
-        assert x['url'] == 'http://test.com'
-        assert x['pi'] == 3
-
-
-    def test_track_url(self):
-        # basic insert
-        id1 = self.conn.track_url('http://example.com', update_mins=30, pi=3)
-        obj = self.conn.db.tracked.find_one()
-        assert '_random' in obj
-        assert obj['update_mins'] == 30
-        assert obj['metadata'] == {'pi': 3}
-
-        # logging
-        log = self.conn.db.logs.find_one()
-        assert log['action'] == 'track'
-        assert log['url'] == 'http://example.com'
-
-        # track same url again with same metadata returns id
-        id2 = self.conn.track_url('http://example.com', update_mins=30, pi=3)
-        assert id1 == id2
-
-        # can't track same URL twice with different metadata
-        assert_raises(ValueError, self.conn.track_url, 'http://example.com')
-
-        # logged error
-        assert self.conn.db.logs.find_one({'error': 'tracking conflict'})
-
-
-    def test_md5_versioning(self):
-        doc = {'url': 'hello.txt'}
-        self.conn.fs.put('hello!', filename='hello.txt')
-        assert not self.conn.md5_versioning(doc, 'hello!')
-        assert self.conn.md5_versioning(doc, 'hey!')
-
-
-    def test_update(self):
-        # get a single document tracked
-        self.conn.track_url('http://example.com', update_mins=60, pi=3)
-        obj = self.conn.db.tracked.find_one()
-        self.conn.update(obj)
-
-        # check that metadata has been updated
-        newobj = self.conn.db.tracked.find_one()
-        assert (newobj['last_update'] +
-                datetime.timedelta(minutes=newobj['update_mins']) ==
-                newobj['next_update'])
-        first_update = newobj['last_update']
-        assert newobj['consecutive_errors'] == 0
-
-        # check that document exists in database
-        doc = self.conn.fs.get_last_version()
-        assert doc.filename == 'http://example.com'
-        assert doc.content_type.startswith('text/html')
-        assert doc.pi == 3
-
-        # check logs
-        assert self.conn.db.logs.find({'action': 'update'}).count() == 1
-
-        # and do an update..
-        self.conn.update(obj)
-
-        # hopefully example.com hasn't changed, this tests that md5 worked
-        assert self.conn.db.fs.files.count() == 1
-
-        # check that appropriate metadata updated
-        newobj = self.conn.db.tracked.find_one()
-        assert first_update < newobj['last_update']
-
-        # check that logs updated
-        assert self.conn.db.logs.find({'action': 'update'}).count() == 2
-
-
-    def test_update_failure(self):
-        # track a non-existent URL
-        self.conn.track_url('http://not_a_url')
-        obj = self.conn.db.tracked.find_one()
-        self.conn.update(obj)
-
-        obj = self.conn.db.tracked.find_one()
-        assert obj['consecutive_errors'] == 1
-
-        # we should have logged an error too
-        assert self.conn.db.logs.find({'action': 'update',
-                                       'error': {'$ne': False}}).count() == 1
-
-        # update again
-        self.conn.update(obj)
-
-        obj = self.conn.db.tracked.find_one()
-        assert obj['consecutive_errors'] == 2
-
-
-    def test_all_versions(self):
-        random_url = 'http://en.wikipedia.org/wiki/Special:Random'
-        self.conn.track_url(random_url)
-        obj = self.conn.db.tracked.find_one()
-        self.conn.update(obj)
-
-        versions = self.conn.get_all_versions(random_url)
-        assert versions[0].filename == random_url
-
-        self.conn.update(obj)
-        assert len(self.conn.get_all_versions(random_url)) == 2
-
-
-    def test_get_update_queue(self):
-        self.conn.track_url('never-updates', update_mins=0.01)
-        self.conn.track_url('bad-uri', update_mins=0.01)
-        self.conn.track_url('http://example.com', update_mins=0.01)
-
-        never = self.conn.db.tracked.find_one(dict(url='never-updates'))
-        bad = self.conn.db.tracked.find_one(dict(url='bad-uri'))
-        good = self.conn.db.tracked.find_one(dict(url='http://example.com'))
-
-        # 3 in queue, ordered by random
-        queue = self.conn.get_update_queue()
-        assert len(queue) == 3
-        assert queue[0]['_random'] < queue[1]['_random'] < queue[2]['_random']
-
-        # try and update bad & good
-        self.conn.update(bad)
-        self.conn.update(good)
-
-        # queue should only have never in it
-        queue = self.conn.get_update_queue()
-        assert queue[0]['_id'] == never['_id']
-
-        # wait for time to pass so queue should be full
-        time.sleep(1)
-        queue = self.conn.get_update_queue()
-        assert len(queue) == 3
-
-
-    def test_get_update_queue_size(self):
-        self.conn.track_url('a', update_mins=0.01)
-        self.conn.track_url('b', update_mins=0.01)
-        self.conn.track_url('c', update_mins=0.01)
-
-        a = self.conn.db.tracked.find_one(dict(url='a'))
-        b = self.conn.db.tracked.find_one(dict(url='b'))
-        c = self.conn.db.tracked.find_one(dict(url='c'))
-
-        # size should start at 3
-        assert self.conn.get_update_queue_size() == 3
-
-        # goes down one
-        self.conn.update(a)
-        assert self.conn.get_update_queue_size() == 2
-
-        # wait for it to go back to 3
-        time.sleep(1)
-        assert self.conn.get_update_queue_size() == 3
oyster/tests/test_kernel.py (new file, 190 lines)
@@ -0,0 +1,190 @@
+import time
+import datetime
+from unittest import TestCase
+
+from nose.tools import assert_raises
+import pymongo
+
+from oyster.core import Kernel
+
+
+class KernelTests(TestCase):
+
+    def setUp(self):
+        self.kernel = Kernel(mongo_db='oyster_test', retry_wait_minutes=1/60.)
+        self.kernel._wipe()
+
+
+    def test_constructor(self):
+        c = Kernel('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000,
+                   user_agent='test-ua', rpm=30, timeout=60,
+                   retry_attempts=7, retry_wait_minutes=8)
+        assert c.db.connection.host == '127.0.0.1'
+        assert c.db.connection.port == 27017
+        assert c.db.logs.options()['capped'] == True
+        assert c.db.logs.options()['size'] == 5000
+        assert c.retry_wait_minutes == 8
+        # TODO: test retry_attempts
+        assert c.scraper.user_agent == 'test-ua'
+        assert c.scraper.requests_per_minute == 30
+        assert c.scraper.timeout == 60
+
+
+    def test_log(self):
+        self.kernel.log('action1', 'http://example.com')
+        self.kernel.log('action2', 'http://test.com', error=True, pi=3)
+        assert self.kernel.db.logs.count() == 2
+        x = self.kernel.db.logs.find_one({'error': True})
+        assert x['action'] == 'action2'
+        assert x['url'] == 'http://test.com'
+        assert x['pi'] == 3
+
+
+    def test_track_url(self):
+        # basic insert
+        id1 = self.kernel.track_url('http://example.com', update_mins=30, pi=3)
+        obj = self.kernel.db.tracked.find_one()
+        assert '_random' in obj
+        assert obj['update_mins'] == 30
+        assert obj['metadata'] == {'pi': 3}
+
+        # logging
+        log = self.kernel.db.logs.find_one()
+        assert log['action'] == 'track'
+        assert log['url'] == 'http://example.com'
+
+        # track same url again with same metadata returns id
+        id2 = self.kernel.track_url('http://example.com', update_mins=30, pi=3)
+        assert id1 == id2
+
+        # can't track same URL twice with different metadata
+        assert_raises(ValueError, self.kernel.track_url, 'http://example.com')
+
+        # logged error
+        assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
+
+
+    def test_md5_versioning(self):
+        doc = {'url': 'hello.txt'}
+        self.kernel.fs.put('hello!', filename='hello.txt')
+        assert not self.kernel.md5_versioning(doc, 'hello!')
+        assert self.kernel.md5_versioning(doc, 'hey!')
+
+
+    def test_update(self):
+        # get a single document tracked
+        self.kernel.track_url('http://example.com', update_mins=60, pi=3)
+        obj = self.kernel.db.tracked.find_one()
+        self.kernel.update(obj)
+
+        # check that metadata has been updated
+        newobj = self.kernel.db.tracked.find_one()
+        assert (newobj['last_update'] +
+                datetime.timedelta(minutes=newobj['update_mins']) ==
+                newobj['next_update'])
+        first_update = newobj['last_update']
+        assert newobj['consecutive_errors'] == 0
+
+        # check that document exists in database
+        doc = self.kernel.fs.get_last_version()
+        assert doc.filename == 'http://example.com'
+        assert doc.content_type.startswith('text/html')
+        assert doc.pi == 3
+
+        # check logs
+        assert self.kernel.db.logs.find({'action': 'update'}).count() == 1
+
+        # and do an update..
+        self.kernel.update(obj)
+
+        # hopefully example.com hasn't changed, this tests that md5 worked
+        assert self.kernel.db.fs.files.count() == 1
+
+        # check that appropriate metadata updated
+        newobj = self.kernel.db.tracked.find_one()
+        assert first_update < newobj['last_update']
+
+        # check that logs updated
+        assert self.kernel.db.logs.find({'action': 'update'}).count() == 2
+
+
+    def test_update_failure(self):
+        # track a non-existent URL
+        self.kernel.track_url('http://not_a_url')
+        obj = self.kernel.db.tracked.find_one()
+        self.kernel.update(obj)
+
+        obj = self.kernel.db.tracked.find_one()
+        assert obj['consecutive_errors'] == 1
+
+        # we should have logged an error too
+        assert self.kernel.db.logs.find({'action': 'update',
+                                         'error': {'$ne': False}}).count() == 1
+
+        # update again
+        self.kernel.update(obj)
+
+        obj = self.kernel.db.tracked.find_one()
+        assert obj['consecutive_errors'] == 2
+
+
+    def test_all_versions(self):
+        random_url = 'http://en.wikipedia.org/wiki/Special:Random'
+        self.kernel.track_url(random_url)
+        obj = self.kernel.db.tracked.find_one()
+        self.kernel.update(obj)
+
+        versions = self.kernel.get_all_versions(random_url)
+        assert versions[0].filename == random_url
+
+        self.kernel.update(obj)
+        assert len(self.kernel.get_all_versions(random_url)) == 2
+
+
+    def test_get_update_queue(self):
+        self.kernel.track_url('never-updates', update_mins=0.01)
+        self.kernel.track_url('bad-uri', update_mins=0.01)
+        self.kernel.track_url('http://example.com', update_mins=0.01)
+
+        never = self.kernel.db.tracked.find_one(dict(url='never-updates'))
+        bad = self.kernel.db.tracked.find_one(dict(url='bad-uri'))
+        good = self.kernel.db.tracked.find_one(dict(url='http://example.com'))
+
+        # 3 in queue, ordered by random
+        queue = self.kernel.get_update_queue()
+        assert len(queue) == 3
+        assert queue[0]['_random'] < queue[1]['_random'] < queue[2]['_random']
+
+        # try and update bad & good
+        self.kernel.update(bad)
+        self.kernel.update(good)
+
+        # queue should only have never in it
+        queue = self.kernel.get_update_queue()
+        assert queue[0]['_id'] == never['_id']
+
+        # wait for time to pass so queue should be full
+        time.sleep(1)
+        queue = self.kernel.get_update_queue()
+        assert len(queue) == 3
+
+
+    def test_get_update_queue_size(self):
+        self.kernel.track_url('a', update_mins=0.01)
+        self.kernel.track_url('b', update_mins=0.01)
+        self.kernel.track_url('c', update_mins=0.01)
+
+        a = self.kernel.db.tracked.find_one(dict(url='a'))
+        b = self.kernel.db.tracked.find_one(dict(url='b'))
+        c = self.kernel.db.tracked.find_one(dict(url='c'))
+
+        # size should start at 3
+        assert self.kernel.get_update_queue_size() == 3
+
+        # goes down one
+        self.kernel.update(a)
+        assert self.kernel.get_update_queue_size() == 2
+
+        # wait for it to go back to 3
+        time.sleep(1)
+        assert self.kernel.get_update_queue_size() == 3
@@ -7,7 +7,7 @@ import flask
 import pymongo.objectid
 
 from oyster.conf import settings
-from oyster.connection import get_configured_connection
+from oyster.core import kernel
 
 
 class JSONEncoder(json.JSONEncoder):
@@ -43,16 +43,14 @@ def api_wrapper(template=None):
 
 
 app = flask.Flask('oyster')
-conn = get_configured_connection()
-
 
 @app.route('/')
 @api_wrapper('index.html')
 def index():
     status = {
-        'tracking': conn.db.tracked.count(),
-        'need_update': conn.get_update_queue_size(),
-        'logs': list(conn.db.logs.find().sort('$natural', -1).limit(20)),
+        'tracking': kernel.db.tracked.count(),
+        'need_update': kernel.get_update_queue_size(),
+        'logs': list(kernel.db.logs.find().sort('$natural', -1).limit(20)),
         'mongo_host': settings.MONGO_HOST,
     }
     return status
@@ -62,8 +60,8 @@ def index():
 @api_wrapper()
 def doc_list():
     status = {
-        'tracking': conn.db.tracked.count(),
-        'need_update': conn.get_update_queue_size(),
+        'tracking': kernel.db.tracked.count(),
+        'need_update': kernel.get_update_queue_size(),
     }
     return status
 
@@ -75,7 +73,7 @@ def log_view():
     size = 100
     prev_offset = max(offset - size, 0)
     next_offset = offset + size
-    logs = conn.db.logs.find().sort('$natural', -1).skip(offset).limit(size)
+    logs = kernel.db.logs.find().sort('$natural', -1).skip(offset).limit(size)
     return dict(logs=list(logs), prev_offset=prev_offset,
                 next_offset=next_offset, offset=offset)
 
@@ -83,14 +81,14 @@ def log_view():
 @app.route('/tracked/')
 @api_wrapper()
 def tracked():
-    tracked = list(conn.db.tracked.find())
+    tracked = list(kernel.db.tracked.find())
     return json.dumps(tracked, cls=JSONEncoder)
 
 
 @app.route('/tracked/<path:url>')
 def tracked_view(url):
     url = _path_fixer(url)
-    doc = conn.db.tracked.find_one({'url': url})
+    doc = kernel.db.tracked.find_one({'url': url})
     return json.dumps(doc, cls=JSONEncoder)
 
 
@@ -99,7 +97,7 @@ def show_doc(url, version):
     url = _path_fixer(url)
     if version == 'latest':
         version = -1
-    doc = conn.get_version(url, version)
+    doc = kernel.get_version(url, version)
     resp = flask.make_response(doc.read())
     resp.headers['content-type'] = doc.content_type
     return resp
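
Note: every view now reads through the shared kernel and the routes themselves are unchanged, which a quick check with flask's test client shows (assuming this module is importable as oyster.web):

    # hedged sketch using flask's built-in test client
    from oyster.web import app

    client = app.test_client()
    client.get('/')          # index: counts plus recent logs
    client.get('/tracked/')  # JSON list of tracked documents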