change how retries work

James Turk 2011-09-20 14:51:36 -04:00
parent dd622ab250
commit 5da67c03da
4 changed files with 22 additions and 19 deletions

View File

@@ -20,7 +20,7 @@ def get_configured_client():
                   rpm=settings.REQUESTS_PER_MINUTE,
                   timeout=settings.REQUEST_TIMEOUT,
                   retry_attempts=settings.RETRY_ATTEMPTS,
-                  retry_wait_seconds=settings.RETRY_WAIT_SECONDS)
+                  retry_wait_minutes=settings.RETRY_WAIT_MINUTES)
 
 
 class Client(object):
@@ -29,7 +29,7 @@ class Client(object):
     def __init__(self, mongo_host='localhost', mongo_port=27017,
                  mongo_db='oyster', mongo_log_maxsize=100000000,
                  user_agent='oyster', rpm=600, timeout=None,
-                 retry_attempts=0, retry_wait_seconds=5):
+                 retry_attempts=100, retry_wait_minutes=1/60.):
 
         # set up a capped log if it doesn't exist
         self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
@@ -50,9 +50,11 @@ class Client(object):
             follow_robots=False,
             raise_errors=True,
             timeout=timeout,
-            retry_attempts=retry_attempts,
-            retry_wait_seconds=retry_wait_seconds
+            # disable scrapelib's retries
+            retry_attempts=0,
+            retry_wait_seconds=0,
         )
+        self.retry_wait_minutes = retry_wait_minutes
 
 
     def _wipe(self):
@@ -131,14 +133,18 @@ class Client(object):
             self.fs.put(data, filename=doc['url'], content_type=content_type,
                         **doc['metadata'])
 
+        if error:
+            c_errors = doc.get('consecutive_errors', 0)
+            doc['consecutive_errors'] = c_errors + 1
+            update_mins = self.retry_wait_minutes * (2**c_errors)
+        else:
+            doc['consecutive_errors'] = 0
+            update_mins = doc['update_mins']
+
         # last_update/next_update are separate from question of versioning
         doc['last_update'] = datetime.datetime.utcnow()
         doc['next_update'] = (doc['last_update'] +
-                              datetime.timedelta(minutes=doc['update_mins']))
-
-        if error:
-            doc['consecutive_errors'] = doc.get('consecutive_errors', 0) + 1
-        else:
-            doc['consecutive_errors'] = 0
+                              datetime.timedelta(minutes=update_mins))
 
         self.log('update', url=url, new_doc=do_put, error=error)
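
Taken together, the client changes move retry handling out of scrapelib (whose retries are disabled above) and into oyster's own scheduling: each consecutive error doubles the wait before the next attempt, and a success resets the document to its normal update_mins interval. A minimal sketch of the resulting schedule, assuming the new default retry_wait_minutes=1/60. (one second):

    # sketch only, not part of this commit: the backoff computed above
    retry_wait_minutes = 1 / 60.0  # the new default, i.e. one second

    def backoff_minutes(consecutive_errors):
        # mirrors: update_mins = self.retry_wait_minutes * (2**c_errors)
        return retry_wait_minutes * (2 ** consecutive_errors)

    for c_errors in range(5):
        print('%d consecutive errors -> wait %.4f min' %
              (c_errors, backoff_minutes(c_errors)))
    # 0 -> 0.0167 (1s), 1 -> 0.0333 (2s), 2 -> 0.0667 (4s), ...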

View File

@@ -9,4 +9,4 @@ USER_AGENT = 'oyster'
 REQUESTS_PER_MINUTE = 300
 REQUEST_TIMEOUT = 0
 RETRY_ATTEMPTS = 0
-RETRY_WAIT_SECONDS = 5
+RETRY_WAIT_MINUTES = 60
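
Under these defaults a persistently failing document is retried after an hour, with the wait doubling on every further failure. A quick worked example (sketch, not part of the commit):

    # delays produced by the default RETRY_WAIT_MINUTES = 60
    RETRY_WAIT_MINUTES = 60
    delays = [RETRY_WAIT_MINUTES * 2 ** n for n in range(4)]
    print(delays)  # [60, 120, 240, 480] -> 1h, 2h, 4h, 8h waits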

View File

@@ -27,10 +27,7 @@ class UpdateTaskScheduler(PeriodicTask):
     # 60s tick
     run_every = 60
 
-    def __init__(self):
-        self.client = get_configured_client()
-
+    client = get_configured_client()
 
     def run(self):
         # if the update queue isn't empty, wait to add more
@@ -41,5 +38,5 @@ class UpdateTaskScheduler(PeriodicTask):
         next_set = self.client.get_update_queue()
         for doc in next_set:
-            update_task.delay(doc['_id'])
+            UpdateTask.delay(doc['_id'])
             self.client.db.status.update({}, {'$inc': {'update_queue': 1}})
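
The scheduler now builds its client once, as a class attribute evaluated at import time, rather than per-instance in __init__; self.client still resolves to the shared object through ordinary attribute lookup. A small sketch of the pattern, with hypothetical names:

    # hypothetical names: a class attribute is created once and shared
    class Scheduler(object):
        client = dict(configured=True)  # evaluated at class-definition time

        def run(self):
            return self.client  # lookup falls through to the class attribute

    assert Scheduler().run() is Scheduler().run()  # one shared client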

View File

@@ -11,23 +11,23 @@ from oyster.client import Client
 class ClientTests(TestCase):
 
     def setUp(self):
-        self.client = Client(mongo_db='oyster_test')
+        self.client = Client(mongo_db='oyster_test', retry_wait_minutes=1/60.)
         self.client._wipe()
 
     def test_constructor(self):
         c = Client('127.0.0.1', 27017, 'testdb', mongo_log_maxsize=5000,
                    user_agent='test-ua', rpm=30, timeout=60,
-                   retry_attempts=1, retry_wait_seconds=10)
+                   retry_attempts=7, retry_wait_minutes=8)
         assert c.db.connection.host == '127.0.0.1'
         assert c.db.connection.port == 27017
         assert c.db.logs.options()['capped'] == True
         assert c.db.logs.options()['size'] == 5000
+        assert c.retry_wait_minutes == 8
+        # TODO: test retry_attempts
         assert c.scraper.user_agent == 'test-ua'
         assert c.scraper.requests_per_minute == 30
         assert c.scraper.timeout == 60
-        assert c.scraper.retry_attempts == 1
-        assert c.scraper.retry_wait_seconds == 10
 
     def test_log(self):
         self.client.log('action1', 'http://example.com')