From 5da67c03daeabd2f8deae8bb0ff2855c6c6503f2 Mon Sep 17 00:00:00 2001 From: James Turk Date: Tue, 20 Sep 2011 14:51:36 -0400 Subject: [PATCH] change how retries work --- oyster/client.py | 24 +++++++++++++++--------- oyster/conf/default_settings.py | 2 +- oyster/tasks.py | 7 ++----- oyster/tests/test_client.py | 8 ++++---- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/oyster/client.py b/oyster/client.py index e92aa34..aaf34a1 100644 --- a/oyster/client.py +++ b/oyster/client.py @@ -20,7 +20,7 @@ def get_configured_client(): rpm=settings.REQUESTS_PER_MINUTE, timeout=settings.REQUEST_TIMEOUT, retry_attempts=settings.RETRY_ATTEMPTS, - retry_wait_seconds=settings.RETRY_WAIT_SECONDS) + retry_wait_minutes=settings.RETRY_WAIT_MINUTES) class Client(object): @@ -29,7 +29,7 @@ class Client(object): def __init__(self, mongo_host='localhost', mongo_port=27017, mongo_db='oyster', mongo_log_maxsize=100000000, user_agent='oyster', rpm=600, timeout=None, - retry_attempts=0, retry_wait_seconds=5): + retry_attempts=100, retry_wait_minutes=1/60.): # set up a capped log if it doesn't exist self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db] @@ -50,9 +50,11 @@ class Client(object): follow_robots=False, raise_errors=True, timeout=timeout, - retry_attempts=retry_attempts, - retry_wait_seconds=retry_wait_seconds + # disable scrapelib's retries + retry_attempts=0, + retry_wait_seconds=0, ) + self.retry_wait_minutes = retry_wait_minutes def _wipe(self): @@ -131,14 +133,18 @@ class Client(object): self.fs.put(data, filename=doc['url'], content_type=content_type, **doc['metadata']) + if error: + c_errors = doc.get('consecutive_errors', 0) + doc['consecutive_errors'] = c_errors + 1 + update_mins = self.retry_wait_minutes * (2**c_errors) + else: + doc['consecutive_errors'] = 0 + update_mins = doc['update_mins'] + # last_update/next_update are separate from question of versioning doc['last_update'] = datetime.datetime.utcnow() doc['next_update'] = (doc['last_update'] + - datetime.timedelta(minutes=doc['update_mins'])) - if error: - doc['consecutive_errors'] = doc.get('consecutive_errors', 0) + 1 - else: - doc['consecutive_errors'] = 0 + datetime.timedelta(minutes=update_mins)) self.log('update', url=url, new_doc=do_put, error=error) diff --git a/oyster/conf/default_settings.py b/oyster/conf/default_settings.py index 0bc138f..6c520e8 100644 --- a/oyster/conf/default_settings.py +++ b/oyster/conf/default_settings.py @@ -9,4 +9,4 @@ USER_AGENT = 'oyster' REQUESTS_PER_MINUTE = 300 REQUEST_TIMEOUT = 0 RETRY_ATTEMPTS = 0 -RETRY_WAIT_SECONDS = 5 +RETRY_WAIT_MINUTES = 60 diff --git a/oyster/tasks.py b/oyster/tasks.py index 1a1ea22..82349dd 100644 --- a/oyster/tasks.py +++ b/oyster/tasks.py @@ -27,10 +27,7 @@ class UpdateTaskScheduler(PeriodicTask): # 60s tick run_every = 60 - - def __init__(self): - self.client = get_configured_client() - + client = get_configured_client() def run(self): # if the update queue isn't empty, wait to add more @@ -41,5 +38,5 @@ class UpdateTaskScheduler(PeriodicTask): next_set = self.client.get_update_queue() for doc in next_set: - update_task.delay(doc['_id']) + UpdateTask.delay(doc['_id']) self.client.db.status.update({}, {'$inc': {'update_queue': 1}}) diff --git a/oyster/tests/test_client.py b/oyster/tests/test_client.py index 569e8eb..648264e 100644 --- a/oyster/tests/test_client.py +++ b/oyster/tests/test_client.py @@ -11,23 +11,23 @@ from oyster.client import Client class ClientTests(TestCase): def setUp(self): - self.client = Client(mongo_db='oyster_test') + self.client = Client(mongo_db='oyster_test', retry_wait_minutes=1/60.) self.client._wipe() def test_constructor(self): c = Client('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000, user_agent='test-ua', rpm=30, timeout=60, - retry_attempts=1, retry_wait_seconds=10) + retry_attempts=7, retry_wait_minutes=8) assert c.db.connection.host == '127.0.0.1' assert c.db.connection.port == 27017 assert c.db.logs.options()['capped'] == True assert c.db.logs.options()['size'] == 5000 + assert c.retry_wait_minutes == 8 + # TODO: test retry_attempts assert c.scraper.user_agent == 'test-ua' assert c.scraper.requests_per_minute == 30 assert c.scraper.timeout == 60 - assert c.scraper.retry_attempts == 1 - assert c.scraper.retry_wait_seconds == 10 def test_log(self): self.client.log('action1', 'http://example.com')