switch from BeautifulSoup to lxml

This commit is contained in:
Jeremy Carbaugh 2012-03-12 16:09:15 -07:00
parent c7544204d8
commit e0e4dd43e2
3 changed files with 13 additions and 26 deletions

View File

@ -92,30 +92,32 @@ class HtmlTableSource(object):
def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
    """
    Load the rows of a single table out of an HTML document.

    htmlfile   -- handed unchanged to lxml.html.parse
    id_or_num  -- an int selects the table by zero-based position among
                  the document's <table> elements; any other value is
                  used as the table's id attribute
    fieldnames -- column names; when empty or omitted they are read from
                  the td/th cells of the first row left after skiprows,
                  and that row is excluded from the data rows
    skiprows   -- number of leading <tr> rows to ignore

    Raises IndexError when no table matches id_or_num, or when the
    field names must be read from a row that does not exist.
    """
    # lxml is imported here rather than at module level, so it is only
    # needed once an HtmlTableSource is actually constructed.
    from lxml.html import parse

    # extract the table
    doc = parse(htmlfile).getroot()
    if isinstance(id_or_num, int):
        table = doc.cssselect('table')[id_or_num]
    else:
        matches = doc.cssselect('table#%s' % id_or_num)
        table = matches[0]  # get the first table

    # select the rows once; both the header lookup and the final row
    # list are slices of the same result
    all_rows = table.cssselect('tr')

    # determine the fieldnames
    if not fieldnames:
        header = all_rows[skiprows:][0]
        self._fieldnames = [cell.text_content()
                            for cell in header.cssselect('td, th')]
        # the header row is not data, so skip it as well
        skiprows += 1
    else:
        self._fieldnames = fieldnames

    # skip the necessary number of rows
    self._rows = all_rows[skiprows:]
def process_tr(self):
    """
    Yield one dict per table row, mapping field names to cell text.

    Only <td> cells are read. Cells are paired with self._fieldnames by
    position using zip, so surplus cells or surplus field names are
    silently dropped, and a row without <td> cells yields an empty dict.
    """
    for row in self._rows:
        # text_content() also gathers text from nested tags, which is
        # what the removed utils.string_dig helper was used for
        strings = [td.text_content() for td in row.cssselect('td')]
        yield dict(zip(self._fieldnames, strings))
def __iter__(self):

View File

@ -65,13 +65,13 @@ class SourceTestCase(unittest.TestCase):
try:
from BeautifulSoup import BeautifulSoup
import lxml
hts = HtmlTableSource(content, 'thetable')
self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}])
except ImportError:
self.skipTest("BeautifulSoup is not installed")
self.skipTest("lxml is not installed")
# Run the unittest test runner when this module is executed as a script.
if __name__ == '__main__':
unittest.main()

View File

@ -25,21 +25,6 @@ def get_django_model(dj_settings, app_label, model_name):
from django.db.models import get_model
return get_model(app_label, model_name)
def string_dig(element, separator=''):
    """
    Dig into BeautifulSoup HTML elements looking for inner strings.

    If element resembled: <p><b>test</b><em>test</em></p>
    then string_dig(element, '~') would return test~test

    element   -- object exposing a ``string`` attribute and ``findAll``
    separator -- text placed between the strings of sibling tags; it is
                 applied at every nesting level

    Returns element.string when it is truthy, otherwise the joined
    strings of the element's child tags.
    """
    if element.string:
        return element.string
    # Only direct children are visited: each child digs through its own
    # subtree, so asking for all descendants here would repeat nested text.
    children = element.findAll(True, recursive=False)
    return separator.join([string_dig(child, separator)
                           for child in children])
def flatten(item, prefix='', separator='_', keys=None):
"""
Flatten nested dictionary into one with its keys concatenated together.