switch from BeautifulSoup to lxml
This commit is contained in:
parent
c7544204d8
commit
e0e4dd43e2
@ -92,30 +92,32 @@ class HtmlTableSource(object):
|
||||
def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
    """
    Locate one table in an HTML document and prepare its rows.

    htmlfile   -- handed straight to lxml.html.parse
    id_or_num  -- an int selects the table by position among all
                  <table> elements; any other value is used as the
                  table's id attribute
    fieldnames -- optional sequence of column names; when omitted the
                  names are read from the first row left after skipping
    skiprows   -- number of leading <tr> rows to discard

    Raises IndexError when no table matches, or when fieldnames have to
    be read from the table but no row is left after skipping.
    """

    # extract the table
    from lxml.html import parse
    doc = parse(htmlfile).getroot()
    if isinstance(id_or_num, int):
        # Indexing the list of matches already yields the table element.
        table = doc.cssselect('table')[id_or_num]
    else:
        # cssselect returns a list of matches; take the first table.
        # The [0] must stay in this branch only: applied to an element
        # it would select the element's first child, not the table.
        table = doc.cssselect('table#%s' % id_or_num)[0]

    # query the rows once; they are sliced after the header is known
    all_rows = table.cssselect('tr')

    # determine the fieldnames
    if not fieldnames:
        header_row = all_rows[skiprows:][0]
        self._fieldnames = [cell.text_content()
                            for cell in header_row.cssselect('td, th')]
        # the header row itself is not data
        skiprows += 1
    else:
        self._fieldnames = fieldnames

    # skip the necessary number of rows
    self._rows = all_rows[skiprows:]
|
||||
|
||||
def process_tr(self):
    """
    Yield one dict per stored table row.

    Each dict maps the configured fieldnames to the text content of
    the row's <td> cells, paired by position.
    """
    for row in self._rows:
        cell_texts = [td.text_content() for td in row.cssselect('td')]
        # zip stops at the shorter sequence, so surplus cells (or
        # surplus fieldnames) are left out of the resulting dict
        yield dict(zip(self._fieldnames, cell_texts))
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -65,13 +65,13 @@ class SourceTestCase(unittest.TestCase):
|
||||
|
||||
try:
|
||||
|
||||
from BeautifulSoup import BeautifulSoup
|
||||
import lxml
|
||||
|
||||
hts = HtmlTableSource(content, 'thetable')
|
||||
self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}])
|
||||
|
||||
except ImportError:
|
||||
self.skipTest("BeautifulSoup is not installed")
|
||||
self.skipTest("lxml is not installed")
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
@ -25,21 +25,6 @@ def get_django_model(dj_settings, app_label, model_name):
|
||||
from django.db.models import get_model
|
||||
return get_model(app_label, model_name)
|
||||
|
||||
|
||||
def string_dig(element, separator=''):
    """
    Dig into BeautifulSoup HTML elements looking for inner strings.

    If element resembled: <p><b>test</b><em>test</em></p>
    then string_dig(element, '~') would return test~test

    When element.string is set it is returned as is. Otherwise the
    strings dug out of the element's direct child tags are joined with
    separator, which is applied at every nesting level.

    Only tags are visited, so loose text sitting next to child tags is
    not included in the result.
    """
    if element.string:
        return element.string

    # Visit direct children only: findAll(True) on its own returns every
    # descendant tag, so recursing over that list would emit the text of
    # nested tags more than once.
    children = element.findAll(True, recursive=False)
    return separator.join([string_dig(child, separator)
                           for child in children])
|
||||
|
||||
|
||||
def flatten(item, prefix='', separator='_', keys=None):
|
||||
"""
|
||||
Flatten nested dictionary into one with its keys concatenated together.
|
||||
|
Loading…
Reference in New Issue
Block a user