change how retries work

parent dd622ab250
commit 5da67c03da

4 changed files with 22 additions and 19 deletions
@@ -20,7 +20,7 @@ def get_configured_client():
                   rpm=settings.REQUESTS_PER_MINUTE,
                   timeout=settings.REQUEST_TIMEOUT,
                   retry_attempts=settings.RETRY_ATTEMPTS,
-                  retry_wait_seconds=settings.RETRY_WAIT_SECONDS)
+                  retry_wait_minutes=settings.RETRY_WAIT_MINUTES)
 
 
 class Client(object):
@@ -29,7 +29,7 @@ class Client(object):
     def __init__(self, mongo_host='localhost', mongo_port=27017,
                  mongo_db='oyster', mongo_log_maxsize=100000000,
                  user_agent='oyster', rpm=600, timeout=None,
-                 retry_attempts=0, retry_wait_seconds=5):
+                 retry_attempts=100, retry_wait_minutes=1/60.):
 
         # set up a capped log if it doesn't exist
         self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
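(Note, not part of the diff: 1/60. of a minute is one second, so a Client() built with no arguments now backs off roughly 1 s, 2 s, 4 s, ... between consecutive failed updates; the test setUp below passes the same retry_wait_minutes=1/60. to keep test waits short.)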
@@ -50,9 +50,11 @@ class Client(object):
                                          follow_robots=False,
                                          raise_errors=True,
                                          timeout=timeout,
-                                         retry_attempts=retry_attempts,
-                                         retry_wait_seconds=retry_wait_seconds
+                                         # disable scrapelib's retries
+                                         retry_attempts=0,
+                                         retry_wait_seconds=0,
                                         )
+        self.retry_wait_minutes = retry_wait_minutes
 
 
     def _wipe(self):
@@ -131,14 +133,18 @@ class Client(object):
             self.fs.put(data, filename=doc['url'], content_type=content_type,
                         **doc['metadata'])
 
+        if error:
+            c_errors = doc.get('consecutive_errors', 0)
+            doc['consecutive_errors'] = c_errors + 1
+            update_mins = self.retry_wait_minutes * (2**c_errors)
+        else:
+            doc['consecutive_errors'] = 0
+            update_mins = doc['update_mins']
+
         # last_update/next_update are separate from question of versioning
         doc['last_update'] = datetime.datetime.utcnow()
         doc['next_update'] = (doc['last_update'] +
-                              datetime.timedelta(minutes=doc['update_mins']))
-        if error:
-            doc['consecutive_errors'] = doc.get('consecutive_errors', 0) + 1
-        else:
-            doc['consecutive_errors'] = 0
+                              datetime.timedelta(minutes=update_mins))
 
         self.log('update', url=url, new_doc=do_put, error=error)
 
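For reference, a minimal sketch (not part of the commit) of the rescheduling rule the hunk above implements. The helper name schedule_next_update is invented for illustration; the dict fields and the retry_wait_minutes value mirror the diff:

import datetime

def schedule_next_update(doc, error, retry_wait_minutes):
    # Illustrative only: mirrors the error-handling branch added above.
    if error:
        c_errors = doc.get('consecutive_errors', 0)
        doc['consecutive_errors'] = c_errors + 1
        # the base wait doubles with each consecutive error
        update_mins = retry_wait_minutes * (2 ** c_errors)
    else:
        doc['consecutive_errors'] = 0
        update_mins = doc['update_mins']
    doc['last_update'] = datetime.datetime.utcnow()
    doc['next_update'] = (doc['last_update'] +
                          datetime.timedelta(minutes=update_mins))
    return doc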
@@ -9,4 +9,4 @@ USER_AGENT = 'oyster'
 REQUESTS_PER_MINUTE = 300
 REQUEST_TIMEOUT = 0
 RETRY_ATTEMPTS = 0
-RETRY_WAIT_SECONDS = 5
+RETRY_WAIT_MINUTES = 60
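(With this default, a document whose updates keep failing is rescheduled 60, then 120, then 240 minutes out, while a single successful update resets consecutive_errors and returns it to its normal update_mins interval.)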
@@ -27,10 +27,7 @@ class UpdateTaskScheduler(PeriodicTask):
 
     # 60s tick
     run_every = 60
-
-    def __init__(self):
-        self.client = get_configured_client()
-
+    client = get_configured_client()
 
     def run(self):
         # if the update queue isn't empty, wait to add more
@@ -41,5 +38,5 @@ class UpdateTaskScheduler(PeriodicTask):
 
         next_set = self.client.get_update_queue()
         for doc in next_set:
-            update_task.delay(doc['_id'])
+            UpdateTask.delay(doc['_id'])
             self.client.db.status.update({}, {'$inc': {'update_queue': 1}})
@@ -11,23 +11,23 @@ from oyster.client import Client
 class ClientTests(TestCase):
 
     def setUp(self):
-        self.client = Client(mongo_db='oyster_test')
+        self.client = Client(mongo_db='oyster_test', retry_wait_minutes=1/60.)
         self.client._wipe()
 
 
     def test_constructor(self):
         c = Client('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000,
                    user_agent='test-ua', rpm=30, timeout=60,
-                   retry_attempts=1, retry_wait_seconds=10)
+                   retry_attempts=7, retry_wait_minutes=8)
         assert c.db.connection.host == '127.0.0.1'
         assert c.db.connection.port == 27017
         assert c.db.logs.options()['capped'] == True
         assert c.db.logs.options()['size'] == 5000
+        assert c.retry_wait_minutes == 8
+        # TODO: test retry_attempts
         assert c.scraper.user_agent == 'test-ua'
         assert c.scraper.requests_per_minute == 30
         assert c.scraper.timeout == 60
-        assert c.scraper.retry_attempts == 1
-        assert c.scraper.retry_wait_seconds == 10
 
     def test_log(self):
         self.client.log('action1', 'http://example.com')