From e0e4dd43e226c959469a81c32a906cb1c39dda7f Mon Sep 17 00:00:00 2001 From: Jeremy Carbaugh Date: Mon, 12 Mar 2012 16:09:15 -0700 Subject: [PATCH] switch from BeautifulSoup to lxml --- saucebrush/sources.py | 20 +++++++++++--------- saucebrush/tests/sources.py | 4 ++-- saucebrush/utils.py | 15 --------------- 3 files changed, 13 insertions(+), 26 deletions(-) diff --git a/saucebrush/sources.py b/saucebrush/sources.py index a3d272b..9ebe532 100644 --- a/saucebrush/sources.py +++ b/saucebrush/sources.py @@ -92,30 +92,32 @@ class HtmlTableSource(object): def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0): # extract the table - from BeautifulSoup import BeautifulSoup - soup = BeautifulSoup(htmlfile.read()) + from lxml.html import parse + doc = parse(htmlfile).getroot() if isinstance(id_or_num, int): - table = soup.findAll('table')[id_or_num] + table = doc.cssselect('table')[id_or_num] else: - table = soup.find('table', id=id_or_num) + table = doc.cssselect('table#%s' % id_or_num) + + table = table[0] # get the first table # skip the necessary number of rows - self._rows = table.findAll('tr')[skiprows:] + self._rows = table.cssselect('tr')[skiprows:] # determine the fieldnames if not fieldnames: - self._fieldnames = [td.string - for td in self._rows[0].findAll(('td','th'))] + self._fieldnames = [td.text_content() + for td in self._rows[0].cssselect('td, th')] skiprows += 1 else: self._fieldnames = fieldnames # skip the necessary number of rows - self._rows = table.findAll('tr')[skiprows:] + self._rows = table.cssselect('tr')[skiprows:] def process_tr(self): for row in self._rows: - strings = [utils.string_dig(td) for td in row.findAll('td')] + strings = [td.text_content() for td in row.cssselect('td')] yield dict(zip(self._fieldnames, strings)) def __iter__(self): diff --git a/saucebrush/tests/sources.py b/saucebrush/tests/sources.py index f3a07dc..1db434c 100644 --- a/saucebrush/tests/sources.py +++ b/saucebrush/tests/sources.py @@ -65,13 +65,13 @@ class SourceTestCase(unittest.TestCase): try: - from BeautifulSoup import BeautifulSoup + import lxml hts = HtmlTableSource(content, 'thetable') self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}]) except ImportError: - self.skipTest("BeautifulSoup is not installed") + self.skipTest("lxml is not installed") if __name__ == '__main__': unittest.main() diff --git a/saucebrush/utils.py b/saucebrush/utils.py index afa67dc..e30d4ef 100644 --- a/saucebrush/utils.py +++ b/saucebrush/utils.py @@ -25,21 +25,6 @@ def get_django_model(dj_settings, app_label, model_name): from django.db.models import get_model return get_model(app_label, model_name) - -def string_dig(element, separator=''): - """ - Dig into BeautifulSoup HTML elements looking for inner strings. - - If element resembled:

testtest

- then string_dig(element, '~') would return test~test - """ - if element.string: - return element.string - else: - return separator.join([string_dig(child) - for child in element.findAll(True)]) - - def flatten(item, prefix='', separator='_', keys=None): """ Flatten nested dictionary into one with its keys concatenated together.