From 15730432372b8a27bb71e88475274428a3d4da7f Mon Sep 17 00:00:00 2001
From: James Turk
Date: Tue, 2 Aug 2011 14:47:19 -0400
Subject: [PATCH] oysterd configurability

---
Reviewer notes (this section, between the "---" line and the diffstat, is
not part of the commit message):

* This patch was rebuilt from a whitespace-collapsed copy.  Line breaks,
  indentation and blank context lines were reconstructed to match the hunk
  line counts; verify them against the target tree before applying.  The
  commit id above and the blob ids on the "index" lines are carried over
  unchanged from the original patch.
* Fix relative to the original patch: the `workers` list is now built
  after `work_queue` is created.  The original added the list
  comprehension above the `work_queue = ...` context line, so main() would
  read the local `work_queue` before assignment (UnboundLocalError).
* Fix relative to the original patch: numeric options now declare `type=`
  so values supplied on the command line are converted instead of being
  left as strings.  Defaults are unchanged.
* NOTE(review): nothing visible in this patch passes the parsed client
  options (mongo_*, logsize, useragent, rpm, timeout, retry_*) on to
  oyster.client.Client -- confirm whether that wiring lives elsewhere or
  is still to be done.

 oyster/bin/oysterd.py |   41 ++++++++++++++++++++++++++++++++++++-----
 oyster/client.py      |   18 ++++++++----------
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/oyster/bin/oysterd.py b/oyster/bin/oysterd.py
index 2289837..522d9b9 100644
--- a/oyster/bin/oysterd.py
+++ b/oyster/bin/oysterd.py
@@ -1,3 +1,4 @@
+import argparse
 import multiprocessing
 import threading
 import time
@@ -33,16 +34,46 @@ class UpdateProcess(multiprocessing.Process):
             self.task_q.task_done()
 
 
-def flask_process():
-    app.run(debug=True)
 
 
 def main():
-    num_processes = 4
-    debug = True
+    parser = argparse.ArgumentParser(description='oyster daemon')
+    parser.add_argument('-w', '--workers', type=int, default=0,
+                        help='number of worker processes to use (default: # processors)')
+    parser.add_argument('-p', '--port', type=int, default=31687,
+                        help='port for HTTP service to run on')
+    parser.add_argument('--debug', action='store_true',
+                        help='enable debug mode')
+    # client config options
+    parser.add_argument('--mongo_host', default='localhost',
+                        help='host or IP for mongodb server')
+    parser.add_argument('--mongo_port', type=int, default=27017,
+                        help='port for mongodb server')
+    parser.add_argument('--mongo_db', default='oyster',
+                        help='mongodb database name')
+    parser.add_argument('--logsize', type=int, default=100000000,
+                        help='mongodb maximum log size (bytes)')
+    parser.add_argument('--useragent', default='oyster',
+                        help='user agent to use when fetching pages')
+    parser.add_argument('--rpm', type=int, default=600,
+                        help='maximum requests per minute to make')
+    parser.add_argument('--timeout', type=float, default=None,
+                        help='timeout (seconds) when making requests (default: none)')
+    parser.add_argument('--retry_attempts', type=int, default=0,
+                        help='retries when making requests (default: 0)')
+    parser.add_argument('--retry_wait', type=float, default=5,
+                        help='retry wait period (seconds) when making requests (default: 5)')
+    args = parser.parse_args()
+
+    # workers defaults to cpu_count
+    if not args.workers:
+        args.workers = multiprocessing.cpu_count()
 
     work_queue = multiprocessing.JoinableQueue()
-    workers = [UpdateProcess(work_queue) for i in xrange(num_processes)]
+    workers = [UpdateProcess(work_queue) for i in xrange(args.workers)]
+
+    def flask_process():
+        app.run(debug=args.debug, port=args.port)
     server = multiprocessing.Process(target=flask_process)
 
     # give flask access to our work_queue
diff --git a/oyster/client.py b/oyster/client.py
index b7cfc27..0e2564a 100644
--- a/oyster/client.py
+++ b/oyster/client.py
@@ -13,26 +13,24 @@ class Client(object):
 
 
     def __init__(self, mongo_host='localhost', mongo_port=27017,
-                 mongo_db='oyster', gridfs_collection='fs',
-                 mongo_log_maxsize=100000000,
-                 user_agent='oyster', rpm=600, follow_robots=False,
-                 raise_errors=True, timeout=None, retry_attempts=0,
-                 retry_wait_seconds=5):
+                 mongo_db='oyster', mongo_log_maxsize=100000000,
+                 user_agent='oyster', rpm=600, timeout=None,
+                 retry_attempts=0, retry_wait_seconds=5):
         self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
         try:
             self.db.create_collection('logs', capped=True,
                                       size=mongo_log_maxsize)
         except pymongo.errors.CollectionInvalid:
             pass
-        self.fs = gridfs.GridFS(self.db, gridfs_collection)
-        self._collection_name = gridfs_collection
+        self._collection_name = 'fs'
+        self.fs = gridfs.GridFS(self.db, self._collection_name)
         self.scraper = scrapelib.Scraper(user_agent=user_agent,
                                          requests_per_minute=rpm,
                                          follow_robots=False,
                                          raise_errors=True,
-                                         timeout=None,
-                                         retry_attempts=0,
-                                         retry_wait_seconds=5
+                                         timeout=timeout,
+                                         retry_attempts=retry_attempts,
+                                         retry_wait_seconds=retry_wait_seconds
                                          )
 
 