switch from BeautifulSoup to lxml

2012-03-12 16:09:15 -07:00 · 2012-03-12 16:09:15 -07:00 · e0e4dd43e2
commit e0e4dd43e2
parent c7544204d8
3 changed files with 13 additions and 26 deletions
--- a/saucebrush/sources.py
+++ b/saucebrush/sources.py
@@ -92,30 +92,32 @@ class HtmlTableSource(object):
    def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):

        # extract the table
-        from BeautifulSoup import BeautifulSoup
-        soup = BeautifulSoup(htmlfile.read())
+        from lxml.html import parse
+        doc = parse(htmlfile).getroot()
        if isinstance(id_or_num, int):
-            table = soup.findAll('table')[id_or_num]
+            table = doc.cssselect('table')[id_or_num]
        else:
-            table = soup.find('table', id=id_or_num)
+            table = doc.cssselect('table#%s' % id_or_num)
+
+        table = table[0] # get the first table

        # skip the necessary number of rows
-        self._rows = table.findAll('tr')[skiprows:]
+        self._rows = table.cssselect('tr')[skiprows:]

        # determine the fieldnames
        if not fieldnames:
-            self._fieldnames = [td.string
-                                for td in self._rows[0].findAll(('td','th'))]
+            self._fieldnames = [td.text_content()
+                                for td in self._rows[0].cssselect('td, th')]
            skiprows += 1
        else:
            self._fieldnames = fieldnames

        # skip the necessary number of rows
-        self._rows = table.findAll('tr')[skiprows:]
+        self._rows = table.cssselect('tr')[skiprows:]

    def process_tr(self):
        for row in self._rows:
-            strings = [utils.string_dig(td) for td in row.findAll('td')]
+            strings = [td.text_content() for td in row.cssselect('td')]
            yield dict(zip(self._fieldnames, strings))

    def __iter__(self):
--- a/saucebrush/tests/sources.py
+++ b/saucebrush/tests/sources.py
@@ -65,13 +65,13 @@ class SourceTestCase(unittest.TestCase):

        try:

-            from BeautifulSoup import BeautifulSoup
+            import lxml

            hts = HtmlTableSource(content, 'thetable')
            self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}])

        except ImportError:
-            self.skipTest("BeautifulSoup is not installed")
+            self.skipTest("lxml is not installed")

 if __name__ == '__main__':
    unittest.main()
--- a/saucebrush/utils.py
+++ b/saucebrush/utils.py
@@ -25,21 +25,6 @@ def get_django_model(dj_settings, app_label, model_name):
    from django.db.models import get_model
    return get_model(app_label, model_name)

-
-def string_dig(element, separator=''):
-    """
-        Dig into BeautifulSoup HTML elements looking for inner strings.
-
-        If element resembled: <p><b>test</b><em>test</em></p>
-        then string_dig(element, '~') would return test~test
-    """
-    if element.string:
-        return element.string
-    else:
-        return separator.join([string_dig(child)
-                            for child in element.findAll(True)])
-
-
 def flatten(item, prefix='', separator='_', keys=None):
    """
        Flatten nested dictionary into one with its keys concatenated together.