change how retries work

James Turk 2011-09-20 14:51:36 -04:00
parent dd622ab250
commit 5da67c03da
4 changed files with 22 additions and 19 deletions

View File

@@ -20,7 +20,7 @@ def get_configured_client():
                   rpm=settings.REQUESTS_PER_MINUTE,
                   timeout=settings.REQUEST_TIMEOUT,
                   retry_attempts=settings.RETRY_ATTEMPTS,
-                  retry_wait_seconds=settings.RETRY_WAIT_SECONDS)
+                  retry_wait_minutes=settings.RETRY_WAIT_MINUTES)
 
 
 class Client(object):
@@ -29,7 +29,7 @@ class Client(object):
     def __init__(self, mongo_host='localhost', mongo_port=27017,
                  mongo_db='oyster', mongo_log_maxsize=100000000,
                  user_agent='oyster', rpm=600, timeout=None,
-                 retry_attempts=0, retry_wait_seconds=5):
+                 retry_attempts=100, retry_wait_minutes=1/60.):
 
         # set up a capped log if it doesn't exist
         self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
@@ -50,9 +50,11 @@ class Client(object):
             follow_robots=False,
             raise_errors=True,
             timeout=timeout,
-            retry_attempts=retry_attempts,
-            retry_wait_seconds=retry_wait_seconds
+            # disable scrapelib's retries
+            retry_attempts=0,
+            retry_wait_seconds=0,
         )
+        self.retry_wait_minutes = retry_wait_minutes
 
 
     def _wipe(self):
@@ -131,14 +133,18 @@ class Client(object):
             self.fs.put(data, filename=doc['url'], content_type=content_type,
                         **doc['metadata'])
 
+        if error:
+            c_errors = doc.get('consecutive_errors', 0)
+            doc['consecutive_errors'] = c_errors + 1
+            update_mins = self.retry_wait_minutes * (2**c_errors)
+        else:
+            doc['consecutive_errors'] = 0
+            update_mins = doc['update_mins']
+
         # last_update/next_update are separate from question of versioning
         doc['last_update'] = datetime.datetime.utcnow()
         doc['next_update'] = (doc['last_update'] +
-                              datetime.timedelta(minutes=doc['update_mins']))
-
-        if error:
-            doc['consecutive_errors'] = doc.get('consecutive_errors', 0) + 1
-        else:
-            doc['consecutive_errors'] = 0
+                              datetime.timedelta(minutes=update_mins))
 
         self.log('update', url=url, new_doc=do_put, error=error)
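
Taken together, the client changes move retry handling out of scrapelib (whose retries are disabled above) and into oyster's own scheduling: each consecutive error doubles the wait before the next attempt, and a success resets the document to its normal update_mins interval. A minimal sketch of the resulting schedule, assuming the new default retry_wait_minutes=1/60. (one second):

    # sketch only, not part of this commit: the backoff computed above
    retry_wait_minutes = 1 / 60.0  # the new default, i.e. one second

    def backoff_minutes(consecutive_errors):
        # mirrors: update_mins = self.retry_wait_minutes * (2**c_errors)
        return retry_wait_minutes * (2 ** consecutive_errors)

    for c_errors in range(5):
        print('%d consecutive errors -> wait %.4f min' %
              (c_errors, backoff_minutes(c_errors)))
    # 0 -> 0.0167 (1s), 1 -> 0.0333 (2s), 2 -> 0.0667 (4s), ...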

View File

@@ -9,4 +9,4 @@ USER_AGENT = 'oyster'
 REQUESTS_PER_MINUTE = 300
 REQUEST_TIMEOUT = 0
 RETRY_ATTEMPTS = 0
-RETRY_WAIT_SECONDS = 5
+RETRY_WAIT_MINUTES = 60
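
Under these defaults a persistently failing document is retried after an hour, with the wait doubling on every further failure. A quick worked example (sketch, not part of the commit):

    # delays produced by the default RETRY_WAIT_MINUTES = 60
    RETRY_WAIT_MINUTES = 60
    delays = [RETRY_WAIT_MINUTES * 2 ** n for n in range(4)]
    print(delays)  # [60, 120, 240, 480] -> 1h, 2h, 4h, 8h waits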

View File

@@ -27,10 +27,7 @@ class UpdateTaskScheduler(PeriodicTask):
     # 60s tick
     run_every = 60
 
-    def __init__(self):
-        self.client = get_configured_client()
-
+    client = get_configured_client()
 
     def run(self):
         # if the update queue isn't empty, wait to add more
@@ -41,5 +38,5 @@ class UpdateTaskScheduler(PeriodicTask):
         next_set = self.client.get_update_queue()
         for doc in next_set:
-            update_task.delay(doc['_id'])
+            UpdateTask.delay(doc['_id'])
             self.client.db.status.update({}, {'$inc': {'update_queue': 1}})
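
The scheduler now builds its client once, as a class attribute evaluated at import time, rather than per-instance in __init__; self.client still resolves to the shared object through ordinary attribute lookup. A small sketch of the pattern, with hypothetical names:

    # hypothetical names: a class attribute is created once and shared
    class Scheduler(object):
        client = dict(configured=True)  # evaluated at class-definition time

        def run(self):
            return self.client  # lookup falls through to the class attribute

    assert Scheduler().run() is Scheduler().run()  # one shared client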

View File

@@ -11,23 +11,23 @@ from oyster.client import Client
 class ClientTests(TestCase):
 
     def setUp(self):
-        self.client = Client(mongo_db='oyster_test')
+        self.client = Client(mongo_db='oyster_test', retry_wait_minutes=1/60.)
         self.client._wipe()
 
     def test_constructor(self):
         c = Client('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000,
                    user_agent='test-ua', rpm=30, timeout=60,
-                   retry_attempts=1, retry_wait_seconds=10)
+                   retry_attempts=7, retry_wait_minutes=8)
         assert c.db.connection.host == '127.0.0.1'
         assert c.db.connection.port == 27017
         assert c.db.logs.options()['capped'] == True
         assert c.db.logs.options()['size'] == 5000
+        assert c.retry_wait_minutes == 8
+        # TODO: test retry_attempts
         assert c.scraper.user_agent == 'test-ua'
         assert c.scraper.requests_per_minute == 30
         assert c.scraper.timeout == 60
-        assert c.scraper.retry_attempts == 1
-        assert c.scraper.retry_wait_seconds == 10
 
     def test_log(self):
         self.client.log('action1', 'http://example.com')