change how duplicates work
This commit is contained in:
parent
cca54b6447
commit
5124fe4eab
@ -91,24 +91,37 @@ class Kernel(object):
|
|||||||
any keyword args will be added to the document's metadata
|
any keyword args will be added to the document's metadata
|
||||||
"""
|
"""
|
||||||
if doc_class not in self.doc_classes:
|
if doc_class not in self.doc_classes:
|
||||||
raise ValueError('unregistered doc_class %s' % doc_class)
|
error = 'unregistered doc_class %s' % doc_class
|
||||||
|
self.log('track', url=url, error=error)
|
||||||
|
raise ValueError(error)
|
||||||
|
|
||||||
|
# try and find an existing version of this document
|
||||||
|
tracked = None
|
||||||
|
|
||||||
|
if id:
|
||||||
|
tracked = self.db.tracked.find_one({'_id': id})
|
||||||
|
|
||||||
|
if not tracked:
|
||||||
tracked = self.db.tracked.find_one({'url': url})
|
tracked = self.db.tracked.find_one({'url': url})
|
||||||
|
|
||||||
# if data is already tracked and this is just a duplicate call
|
# if id exists, ensure that URL and doc_class haven't changed
|
||||||
# return the original object
|
# then return existing data (possibly with refreshed metadata)
|
||||||
if tracked:
|
if tracked:
|
||||||
# only check id if id was passed in
|
if (tracked['url'] == url and
|
||||||
id_matches = (tracked['_id'] == id) if id else True
|
tracked['doc_class'] == doc_class):
|
||||||
if (tracked['metadata'] == kwargs and
|
if kwargs != tracked['metadata']:
|
||||||
tracked['doc_class'] == doc_class and
|
tracked['metadata'] = kwargs
|
||||||
id_matches):
|
self.db.tracked.save(tracked, safe=True)
|
||||||
return tracked['_id']
|
return tracked['_id']
|
||||||
else:
|
else:
|
||||||
self.log('track', url=url, error='tracking conflict')
|
# document already tracked, but with a different URL or doc_class
|
||||||
raise ValueError('%s is already tracked with different '
|
error = ('%s already exists with different data (tracked: '
|
||||||
'metadata: (tracked: %r) (new: %r)' %
|
'%s, %s) (new: %s, %s)' % (tracked['_id'],
|
||||||
(url, tracked['metadata'], kwargs))
|
tracked['url'],
|
||||||
|
tracked['doc_class'],
|
||||||
|
url, doc_class))
|
||||||
|
self.log('track', url=url, error=error)
|
||||||
|
raise ValueError(error)
|
||||||
|
|
||||||
self.log('track', url=url)
|
self.log('track', url=url)
|
||||||
|
|
||||||
@ -117,7 +130,7 @@ class Kernel(object):
|
|||||||
versions=[], metadata=kwargs)
|
versions=[], metadata=kwargs)
|
||||||
if id:
|
if id:
|
||||||
newdoc['_id'] = id
|
newdoc['_id'] = id
|
||||||
return self.db.tracked.insert(newdoc)
|
return self.db.tracked.insert(newdoc, safe=True)
|
||||||
|
|
||||||
def md5_versioning(self, olddata, newdata):
|
def md5_versioning(self, olddata, newdata):
|
||||||
""" return True if md5 changed or if file is new """
|
""" return True if md5 changed or if file is new """
|
||||||
|
@ -8,7 +8,13 @@ class S3Storage(object):
|
|||||||
|
|
||||||
def __init__(self, kernel):
|
def __init__(self, kernel):
|
||||||
self.s3conn = boto.connect_s3(settings.AWS_KEY, settings.AWS_SECRET)
|
self.s3conn = boto.connect_s3(settings.AWS_KEY, settings.AWS_SECRET)
|
||||||
self.bucket = self.s3conn.create_bucket(settings.AWS_BUCKET)
|
self._bucket = False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def bucket(self):
|
||||||
|
if not self._bucket:
|
||||||
|
self._bucket = self.s3conn.create_bucket(settings.AWS_BUCKET)
|
||||||
|
return self._bucket
|
||||||
|
|
||||||
def put(self, tracked_doc, data, content_type):
|
def put(self, tracked_doc, data, content_type):
|
||||||
""" upload the document to S3 """
|
""" upload the document to S3 """
|
||||||
|
@ -83,28 +83,27 @@ class KernelTests(TestCase):
|
|||||||
id2 = self.kernel.track_url('http://example.com', 'default', pi=3)
|
id2 = self.kernel.track_url('http://example.com', 'default', pi=3)
|
||||||
assert id1 == id2
|
assert id1 == id2
|
||||||
|
|
||||||
# test setting id
|
# test manually set id
|
||||||
out = self.kernel.track_url('http://example.com/2', 'default',
|
out = self.kernel.track_url('http://example.com/2', 'default',
|
||||||
'fixed-id')
|
'fixed-id')
|
||||||
assert out == 'fixed-id'
|
assert out == 'fixed-id'
|
||||||
|
|
||||||
# can't track same URL twice with different id
|
# can't track same id twice with different url
|
||||||
assert_raises(ValueError, self.kernel.track_url, 'http://example.com',
|
self.kernel.db.logs.drop()
|
||||||
'default', 'hard-coded-id')
|
assert_raises(ValueError, self.kernel.track_url,
|
||||||
# logged error
|
'http://example.com/3', 'default', 'fixed-id')
|
||||||
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
|
assert 'already exists' in self.kernel.db.logs.find_one()['error']
|
||||||
|
|
||||||
# ... with different metadata
|
# ... or different doc class
|
||||||
assert_raises(ValueError, self.kernel.track_url, 'http://example.com',
|
self.kernel.db.logs.drop()
|
||||||
'default')
|
assert_raises(ValueError, self.kernel.track_url,
|
||||||
# logged error
|
'http://example.com/2', 'change-hook', 'fixed-id')
|
||||||
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
|
assert 'already exists' in self.kernel.db.logs.find_one()['error']
|
||||||
|
|
||||||
# ... different doc class
|
# different metadata is ok, but it should be updated
|
||||||
assert_raises(ValueError, self.kernel.track_url, 'http://example.com',
|
self.kernel.track_url('http://example.com/2', 'default', 'fixed-id',
|
||||||
'special-doc-class', pi=3)
|
pi=3)
|
||||||
# logged error
|
self.kernel.db.tracked.find_one({'_id': 'fixed-id'})['metadata']['pi'] == 3
|
||||||
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
|
|
||||||
|
|
||||||
def test_no_update(self):
|
def test_no_update(self):
|
||||||
# update
|
# update
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
from nose.plugins.skip import SkipTest
|
||||||
|
|
||||||
|
from oyster.conf import settings
|
||||||
from oyster.core import Kernel
|
from oyster.core import Kernel
|
||||||
from oyster.storage.s3 import S3Storage
|
from oyster.storage.s3 import S3Storage
|
||||||
from oyster.storage.gridfs import GridFSStorage
|
from oyster.storage.gridfs import GridFSStorage
|
||||||
@ -21,6 +24,8 @@ def _simple_storage_test(StorageCls):
|
|||||||
|
|
||||||
|
|
||||||
def test_s3():
|
def test_s3():
|
||||||
|
#if not hasattr(settings, 'AWS_BUCKET'):
|
||||||
|
# raise SkipTest('S3 not configured')
|
||||||
_simple_storage_test(S3Storage)
|
_simple_storage_test(S3Storage)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user