change how retries work
This commit is contained in:
parent
dd622ab250
commit
5da67c03da
@ -20,7 +20,7 @@ def get_configured_client():
|
||||
rpm=settings.REQUESTS_PER_MINUTE,
|
||||
timeout=settings.REQUEST_TIMEOUT,
|
||||
retry_attempts=settings.RETRY_ATTEMPTS,
|
||||
retry_wait_seconds=settings.RETRY_WAIT_SECONDS)
|
||||
retry_wait_minutes=settings.RETRY_WAIT_MINUTES)
|
||||
|
||||
|
||||
class Client(object):
|
||||
@ -29,7 +29,7 @@ class Client(object):
|
||||
def __init__(self, mongo_host='localhost', mongo_port=27017,
|
||||
mongo_db='oyster', mongo_log_maxsize=100000000,
|
||||
user_agent='oyster', rpm=600, timeout=None,
|
||||
retry_attempts=0, retry_wait_seconds=5):
|
||||
retry_attempts=100, retry_wait_minutes=1/60.):
|
||||
|
||||
# set up a capped log if it doesn't exist
|
||||
self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
|
||||
@ -50,9 +50,11 @@ class Client(object):
|
||||
follow_robots=False,
|
||||
raise_errors=True,
|
||||
timeout=timeout,
|
||||
retry_attempts=retry_attempts,
|
||||
retry_wait_seconds=retry_wait_seconds
|
||||
# disable scrapelib's retries
|
||||
retry_attempts=0,
|
||||
retry_wait_seconds=0,
|
||||
)
|
||||
self.retry_wait_minutes = retry_wait_minutes
|
||||
|
||||
|
||||
def _wipe(self):
|
||||
@ -131,14 +133,18 @@ class Client(object):
|
||||
self.fs.put(data, filename=doc['url'], content_type=content_type,
|
||||
**doc['metadata'])
|
||||
|
||||
if error:
|
||||
c_errors = doc.get('consecutive_errors', 0)
|
||||
doc['consecutive_errors'] = c_errors + 1
|
||||
update_mins = self.retry_wait_minutes * (2**c_errors)
|
||||
else:
|
||||
doc['consecutive_errors'] = 0
|
||||
update_mins = doc['update_mins']
|
||||
|
||||
# last_update/next_update are separate from question of versioning
|
||||
doc['last_update'] = datetime.datetime.utcnow()
|
||||
doc['next_update'] = (doc['last_update'] +
|
||||
datetime.timedelta(minutes=doc['update_mins']))
|
||||
if error:
|
||||
doc['consecutive_errors'] = doc.get('consecutive_errors', 0) + 1
|
||||
else:
|
||||
doc['consecutive_errors'] = 0
|
||||
datetime.timedelta(minutes=update_mins))
|
||||
|
||||
self.log('update', url=url, new_doc=do_put, error=error)
|
||||
|
||||
|
@ -9,4 +9,4 @@ USER_AGENT = 'oyster'
|
||||
REQUESTS_PER_MINUTE = 300
|
||||
REQUEST_TIMEOUT = 0
|
||||
RETRY_ATTEMPTS = 0
|
||||
RETRY_WAIT_SECONDS = 5
|
||||
RETRY_WAIT_MINUTES = 60
|
||||
|
@ -27,10 +27,7 @@ class UpdateTaskScheduler(PeriodicTask):
|
||||
|
||||
# 60s tick
|
||||
run_every = 60
|
||||
|
||||
def __init__(self):
|
||||
self.client = get_configured_client()
|
||||
|
||||
client = get_configured_client()
|
||||
|
||||
def run(self):
|
||||
# if the update queue isn't empty, wait to add more
|
||||
@ -41,5 +38,5 @@ class UpdateTaskScheduler(PeriodicTask):
|
||||
|
||||
next_set = self.client.get_update_queue()
|
||||
for doc in next_set:
|
||||
update_task.delay(doc['_id'])
|
||||
UpdateTask.delay(doc['_id'])
|
||||
self.client.db.status.update({}, {'$inc': {'update_queue': 1}})
|
||||
|
@ -11,23 +11,23 @@ from oyster.client import Client
|
||||
class ClientTests(TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.client = Client(mongo_db='oyster_test')
|
||||
self.client = Client(mongo_db='oyster_test', retry_wait_minutes=1/60.)
|
||||
self.client._wipe()
|
||||
|
||||
|
||||
def test_constructor(self):
|
||||
c = Client('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000,
|
||||
user_agent='test-ua', rpm=30, timeout=60,
|
||||
retry_attempts=1, retry_wait_seconds=10)
|
||||
retry_attempts=7, retry_wait_minutes=8)
|
||||
assert c.db.connection.host == '127.0.0.1'
|
||||
assert c.db.connection.port == 27017
|
||||
assert c.db.logs.options()['capped'] == True
|
||||
assert c.db.logs.options()['size'] == 5000
|
||||
assert c.retry_wait_minutes == 8
|
||||
# TODO: test retry_attempts
|
||||
assert c.scraper.user_agent == 'test-ua'
|
||||
assert c.scraper.requests_per_minute == 30
|
||||
assert c.scraper.timeout == 60
|
||||
assert c.scraper.retry_attempts == 1
|
||||
assert c.scraper.retry_wait_seconds == 10
|
||||
|
||||
def test_log(self):
|
||||
self.client.log('action1', 'http://example.com')
|
||||
|
Loading…
Reference in New Issue
Block a user