diff --git a/oyster/core.py b/oyster/core.py index 86fcb55..f6441d1 100644 --- a/oyster/core.py +++ b/oyster/core.py @@ -91,24 +91,37 @@ class Kernel(object): any keyword args will be added to the document's metadata """ if doc_class not in self.doc_classes: - raise ValueError('unregistered doc_class %s' % doc_class) + error = 'unregistered doc_class %s' % doc_class + self.log('track', url=url, error=error) + raise ValueError(error) - tracked = self.db.tracked.find_one({'url': url}) + # try and find an existing version of this document + tracked = None - # if data is already tracked and this is just a duplicate call - # return the original object + if id: + tracked = self.db.tracked.find_one({'_id': id}) + + if not tracked: + tracked = self.db.tracked.find_one({'url': url}) + + # if id exists, ensure that URL and doc_class haven't changed + # then return existing data (possibly with refreshed metadata) if tracked: - # only check id if id was passed in - id_matches = (tracked['_id'] == id) if id else True - if (tracked['metadata'] == kwargs and - tracked['doc_class'] == doc_class and - id_matches): + if (tracked['url'] == url and + tracked['doc_class'] == doc_class): + if kwargs != tracked['metadata']: + tracked['metadata'] = kwargs + self.db.tracked.save(tracked, safe=True) return tracked['_id'] else: - self.log('track', url=url, error='tracking conflict') - raise ValueError('%s is already tracked with different ' - 'metadata: (tracked: %r) (new: %r)' % - (url, tracked['metadata'], kwargs)) + # id existed but with different URL + error = ('%s already exists with different data (tracked: ' + '%s, %s) (new: %s, %s)' % (tracked['_id'], + tracked['url'], + tracked['doc_class'], + url, doc_class)) + self.log('track', url=url, error=error) + raise ValueError(error) self.log('track', url=url) @@ -117,7 +130,7 @@ class Kernel(object): versions=[], metadata=kwargs) if id: newdoc['_id'] = id - return self.db.tracked.insert(newdoc) + return self.db.tracked.insert(newdoc, safe=True) def md5_versioning(self, olddata, newdata): """ return True if md5 changed or if file is new """ diff --git a/oyster/storage/s3.py b/oyster/storage/s3.py index f77e9a6..d47901a 100644 --- a/oyster/storage/s3.py +++ b/oyster/storage/s3.py @@ -8,7 +8,13 @@ class S3Storage(object): def __init__(self, kernel): self.s3conn = boto.connect_s3(settings.AWS_KEY, settings.AWS_SECRET) - self.bucket = self.s3conn.create_bucket(settings.AWS_BUCKET) + self._bucket = False + + @property + def bucket(self): + if not self._bucket: + self._bucket = self.s3conn.create_bucket(settings.AWS_BUCKET) + return self._bucket def put(self, tracked_doc, data, content_type): """ upload the document to S3 """ diff --git a/oyster/tests/test_kernel.py b/oyster/tests/test_kernel.py index 90a4719..9fa95a6 100644 --- a/oyster/tests/test_kernel.py +++ b/oyster/tests/test_kernel.py @@ -83,28 +83,27 @@ class KernelTests(TestCase): id2 = self.kernel.track_url('http://example.com', 'default', pi=3) assert id1 == id2 - # test setting id + # test manually set id out = self.kernel.track_url('http://example.com/2', 'default', 'fixed-id') assert out == 'fixed-id' - # can't track same URL twice with different id - assert_raises(ValueError, self.kernel.track_url, 'http://example.com', - 'default', 'hard-coded-id') - # logged error - assert self.kernel.db.logs.find_one({'error': 'tracking conflict'}) + # can't pass track same id twice with different url + self.kernel.db.logs.drop() + assert_raises(ValueError, self.kernel.track_url, + 'http://example.com/3', 'default', 'fixed-id') + assert 'already exists' in self.kernel.db.logs.find_one()['error'] - # ... with different metadata - assert_raises(ValueError, self.kernel.track_url, 'http://example.com', - 'default') - # logged error - assert self.kernel.db.logs.find_one({'error': 'tracking conflict'}) + # ... or different doc class + self.kernel.db.logs.drop() + assert_raises(ValueError, self.kernel.track_url, + 'http://example.com/2', 'change-hook', 'fixed-id') + assert 'already exists' in self.kernel.db.logs.find_one()['error'] - # ... different doc class - assert_raises(ValueError, self.kernel.track_url, 'http://example.com', - 'special-doc-class', pi=3) - # logged error - assert self.kernel.db.logs.find_one({'error': 'tracking conflict'}) + # different metadata is ok, but it should be updated + self.kernel.track_url('http://example.com/2', 'default', 'fixed-id', + pi=3) + self.kernel.db.tracked.find_one({'_id': 'fixed-id'})['metadata']['pi'] == 3 def test_no_update(self): # update diff --git a/oyster/tests/test_storage.py b/oyster/tests/test_storage.py index 23c7d77..9e92540 100644 --- a/oyster/tests/test_storage.py +++ b/oyster/tests/test_storage.py @@ -1,3 +1,6 @@ +from nose.plugins.skip import SkipTest + +from oyster.conf import settings from oyster.core import Kernel from oyster.storage.s3 import S3Storage from oyster.storage.gridfs import GridFSStorage @@ -21,6 +24,8 @@ def _simple_storage_test(StorageCls): def test_s3(): + #if not hasattr(settings, 'AWS_BUCKET'): + # raise SkipTest('S3 not configured') _simple_storage_test(S3Storage)