ExternalStoreTask takes an extract_text argument
This commit is contained in:
parent
e6c9cd6a06
commit
5b71fa4aea
@ -64,13 +64,14 @@ class ExternalStoreTask(Task):
|
|||||||
# one client per process
|
# one client per process
|
||||||
self.client = get_configured_client()
|
self.client = get_configured_client()
|
||||||
|
|
||||||
def run(self, doc_id):
|
def run(self, doc_id, extract_text=lambda x: x):
|
||||||
# get the document
|
# get the document
|
||||||
doc = self.client.db.tracked.find_one({'_id': ObjectId(doc_id)})
|
doc = self.client.db.tracked.find_one({'_id': ObjectId(doc_id)})
|
||||||
filedata = self.client.get_version(doc['url']).read()
|
filedata = self.client.get_version(doc['url']).read()
|
||||||
|
text = extract_text(filedata)
|
||||||
|
|
||||||
# put the document into the data store
|
# put the document into the data store
|
||||||
result = self.upload_document(doc_id, filedata, doc['metadata'])
|
result = self.upload_document(doc_id, text, doc['metadata'])
|
||||||
|
|
||||||
doc[self.external_store + '_id'] = result
|
doc[self.external_store + '_id'] = result
|
||||||
self.client.db.tracked.save(doc, safe=True)
|
self.client.db.tracked.save(doc, safe=True)
|
||||||
|
Loading…
Reference in New Issue
Block a user