change how duplicates work

This commit is contained in:
James Turk 2012-03-21 12:44:32 -04:00
parent cca54b6447
commit 5124fe4eab
4 changed files with 54 additions and 31 deletions

View File

@ -91,24 +91,37 @@ class Kernel(object):
any keyword args will be added to the document's metadata any keyword args will be added to the document's metadata
""" """
if doc_class not in self.doc_classes: if doc_class not in self.doc_classes:
raise ValueError('unregistered doc_class %s' % doc_class) error = 'unregistered doc_class %s' % doc_class
self.log('track', url=url, error=error)
raise ValueError(error)
# try and find an existing version of this document
tracked = None
if id:
tracked = self.db.tracked.find_one({'_id': id})
if not tracked:
tracked = self.db.tracked.find_one({'url': url}) tracked = self.db.tracked.find_one({'url': url})
# if data is already tracked and this is just a duplicate call # if id exists, ensure that URL and doc_class haven't changed
# return the original object # then return existing data (possibly with refreshed metadata)
if tracked: if tracked:
# only check id if id was passed in if (tracked['url'] == url and
id_matches = (tracked['_id'] == id) if id else True tracked['doc_class'] == doc_class):
if (tracked['metadata'] == kwargs and if kwargs != tracked['metadata']:
tracked['doc_class'] == doc_class and tracked['metadata'] = kwargs
id_matches): self.db.tracked.save(tracked, safe=True)
return tracked['_id'] return tracked['_id']
else: else:
self.log('track', url=url, error='tracking conflict') # id existed but with different URL
raise ValueError('%s is already tracked with different ' error = ('%s already exists with different data (tracked: '
'metadata: (tracked: %r) (new: %r)' % '%s, %s) (new: %s, %s)' % (tracked['_id'],
(url, tracked['metadata'], kwargs)) tracked['url'],
tracked['doc_class'],
url, doc_class))
self.log('track', url=url, error=error)
raise ValueError(error)
self.log('track', url=url) self.log('track', url=url)
@ -117,7 +130,7 @@ class Kernel(object):
versions=[], metadata=kwargs) versions=[], metadata=kwargs)
if id: if id:
newdoc['_id'] = id newdoc['_id'] = id
return self.db.tracked.insert(newdoc) return self.db.tracked.insert(newdoc, safe=True)
def md5_versioning(self, olddata, newdata): def md5_versioning(self, olddata, newdata):
""" return True if md5 changed or if file is new """ """ return True if md5 changed or if file is new """

View File

@ -8,7 +8,13 @@ class S3Storage(object):
def __init__(self, kernel): def __init__(self, kernel):
self.s3conn = boto.connect_s3(settings.AWS_KEY, settings.AWS_SECRET) self.s3conn = boto.connect_s3(settings.AWS_KEY, settings.AWS_SECRET)
self.bucket = self.s3conn.create_bucket(settings.AWS_BUCKET) self._bucket = False
@property
def bucket(self):
if not self._bucket:
self._bucket = self.s3conn.create_bucket(settings.AWS_BUCKET)
return self._bucket
def put(self, tracked_doc, data, content_type): def put(self, tracked_doc, data, content_type):
""" upload the document to S3 """ """ upload the document to S3 """

View File

@ -83,28 +83,27 @@ class KernelTests(TestCase):
id2 = self.kernel.track_url('http://example.com', 'default', pi=3) id2 = self.kernel.track_url('http://example.com', 'default', pi=3)
assert id1 == id2 assert id1 == id2
# test setting id # test manually set id
out = self.kernel.track_url('http://example.com/2', 'default', out = self.kernel.track_url('http://example.com/2', 'default',
'fixed-id') 'fixed-id')
assert out == 'fixed-id' assert out == 'fixed-id'
# can't track same URL twice with different id # can't track same id twice with different url
assert_raises(ValueError, self.kernel.track_url, 'http://example.com', self.kernel.db.logs.drop()
'default', 'hard-coded-id') assert_raises(ValueError, self.kernel.track_url,
# logged error 'http://example.com/3', 'default', 'fixed-id')
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'}) assert 'already exists' in self.kernel.db.logs.find_one()['error']
# ... with different metadata # ... or different doc class
assert_raises(ValueError, self.kernel.track_url, 'http://example.com', self.kernel.db.logs.drop()
'default') assert_raises(ValueError, self.kernel.track_url,
# logged error 'http://example.com/2', 'change-hook', 'fixed-id')
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'}) assert 'already exists' in self.kernel.db.logs.find_one()['error']
# ... different doc class # different metadata is ok, but it should be updated
assert_raises(ValueError, self.kernel.track_url, 'http://example.com', self.kernel.track_url('http://example.com/2', 'default', 'fixed-id',
'special-doc-class', pi=3) pi=3)
# logged error self.kernel.db.tracked.find_one({'_id': 'fixed-id'})['metadata']['pi'] == 3
assert self.kernel.db.logs.find_one({'error': 'tracking conflict'})
def test_no_update(self): def test_no_update(self):
# update # update

View File

@ -1,3 +1,6 @@
from nose.plugins.skip import SkipTest
from oyster.conf import settings
from oyster.core import Kernel from oyster.core import Kernel
from oyster.storage.s3 import S3Storage from oyster.storage.s3 import S3Storage
from oyster.storage.gridfs import GridFSStorage from oyster.storage.gridfs import GridFSStorage
@ -21,6 +24,8 @@ def _simple_storage_test(StorageCls):
def test_s3(): def test_s3():
#if not hasattr(settings, 'AWS_BUCKET'):
# raise SkipTest('S3 not configured')
_simple_storage_test(S3Storage) _simple_storage_test(S3Storage)