basic logging

parent 08aef40662
commit fc1a311615

2 changed files with 26 additions and 5 deletions
@@ -14,10 +14,16 @@ class Client(object):
 
     def __init__(self, mongo_host='localhost', mongo_port=27017,
-                 mongo_db='oyster', gridfs_collection='fs', 
+                 mongo_db='oyster', gridfs_collection='fs',
+                 mongo_log_maxsize=100000000,
                  user_agent='oyster', rpm=600, follow_robots=False,
                  raise_errors=True, timeout=None, retry_attempts=0,
                  retry_wait_seconds=5):
         self.db = pymongo.Connection(mongo_host, mongo_port)[mongo_db]
+        try:
+            self.db.create_collection('logs', capped=True,
+                                      size=mongo_log_maxsize)
+        except pymongo.errors.CollectionInvalid:
+            pass
         self.fs = gridfs.GridFS(self.db, gridfs_collection)
         self._collection_name = gridfs_collection
         self.scraper = scrapelib.Scraper(user_agent=user_agent,
@@ -35,7 +41,13 @@ class Client(object):
         self.db.drop_collection('tracked')
         self.db.drop_collection('%s.chunks' % self._collection_name)
         self.db.drop_collection('%s.files' % self._collection_name)
+        self.db.drop_collection('logs')
 
+    def log(self, action, error=False, **kwargs):
+        kwargs['action'] = action
+        kwargs['error'] = error
+        kwargs['timestamp'] = datetime.datetime.utcnow()
+        self.db.logs.insert(kwargs)
 
     def track_url(self, url, versioning='md5', update_mins=60*24,
                   **kwargs):
@@ -46,8 +58,10 @@ class Client(object):
             URL to start tracking
         """
         if self.db.tracked.find_one({'url': url}):
+            self.log('track', url=url, error='already tracked')
             raise ValueError('%s is already tracked' % url)
 
+        self.log('track', url=url)
         self.db.tracked.insert(dict(url=url, versioning=versioning,
                                     update_mins=update_mins,
                                     _random=random.randint(0, sys.maxint),
@@ -73,10 +87,10 @@ class Client(object):
             url = doc['url'].replace(' ', '%20')
             data = self.scraper.urlopen(url)
             content_type = data.response.headers['content-type']
-        except scrapelib.HTTPError:
+        except scrapelib.HTTPError as e:
             # TODO: log error
             do_put = False
-            error = True
+            error = e
 
         # versioning is a concept for future use, but here's how it can work:
         #  versioning functions take doc & data, and return True if data is
@@ -103,6 +117,9 @@ class Client(object):
         else:
             doc['_consecutive_errors'] = 0
 
+
+        self.log('update', url=url, new_doc=do_put, error=error)
+
         self.db.tracked.save(doc, safe=True)
 
 
@@ -123,8 +140,7 @@ class Client(object):
 
 
     def get_update_queue(self, max=0):
-        # results are always sorted by random to avoid piling on
-        # a single server
+        # results are always sorted by random to avoid piling on single server
 
         # first we try to update anything that we've never retrieved
         new = self.db.tracked.find({'_next_update':
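The new log() method is a thin wrapper around a capped MongoDB collection, which behaves as a fixed-size ring buffer: once the collection reaches mongo_log_maxsize bytes, new inserts silently overwrite the oldest entries, so the log never grows without bound. A minimal standalone sketch of the same pattern, assuming a local mongod and the pre-MongoClient pymongo API this file already uses (newer pymongo spells these MongoClient and insert_one):

    import datetime
    import pymongo

    db = pymongo.Connection('localhost', 27017)['oyster']

    # capped=True turns 'logs' into a fixed-size ring buffer; creating it
    # a second time raises CollectionInvalid, which is safe to ignore
    try:
        db.create_collection('logs', capped=True, size=100000000)
    except pymongo.errors.CollectionInvalid:
        pass

    def log(action, error=False, **kwargs):
        kwargs['action'] = action
        kwargs['error'] = error
        kwargs['timestamp'] = datetime.datetime.utcnow()
        db.logs.insert(kwargs)

    log('track', url='http://example.com')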
@@ -13,6 +13,11 @@ def doc_list():
     }
     return flask.jsonify(**status)
 
+@app.route('/log/')
+def log_view():
+    logs = client.db.logs.find().sort('$natural', -1)
+    return flask.render_template('logs.html', logs=logs)
+
 
 @app.route('/doc/<path:url>/<version>')
 def show_doc(url, version):
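The /log/ view leans on a guarantee of capped collections: documents are stored in insertion order, so sorting on $natural descending returns the newest entries first without any index. The logs.html template is not part of this diff; a hypothetical way to consume the same cursor (the limit and field choices here are illustrative, not from the commit):

    import pymongo

    db = pymongo.Connection('localhost', 27017)['oyster']

    # $natural is on-disk (insertion) order; -1 yields newest-first
    for entry in db.logs.find().sort('$natural', -1).limit(20):
        print entry['timestamp'], entry['action'], entry['error']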