From e0e4dd43e226c959469a81c32a906cb1c39dda7f Mon Sep 17 00:00:00 2001
From: Jeremy Carbaugh <jcarbaugh@gmail.com>
Date: Mon, 12 Mar 2012 16:09:15 -0700
Subject: [PATCH] switch from BeautifulSoup to lxml

---
 saucebrush/sources.py       | 20 +++++++++++---------
 saucebrush/tests/sources.py |  4 ++--
 saucebrush/utils.py         | 15 ---------------
 3 files changed, 13 insertions(+), 26 deletions(-)
diff --git a/saucebrush/sources.py b/saucebrush/sources.py
index a3d272b..9ebe532 100644
--- a/saucebrush/sources.py
+++ b/saucebrush/sources.py
@@ -92,30 +92,32 @@ class HtmlTableSource(object):
     def __init__(self, htmlfile, id_or_num, fieldnames=None, skiprows=0):
 
         # extract the table
-        from BeautifulSoup import BeautifulSoup
-        soup = BeautifulSoup(htmlfile.read())
+        from lxml.html import parse
+        doc = parse(htmlfile).getroot()
         if isinstance(id_or_num, int):
-            table = soup.findAll('table')[id_or_num]
+            table = doc.cssselect('table')[id_or_num]
         else:
-            table = soup.find('table', id=id_or_num)
+            # cssselect() returns a list; only the id lookup needs unwrapping
+            # (indexing the element itself would yield its first child)
+            table = doc.cssselect('table#%s' % id_or_num)[0]
 
         # skip the necessary number of rows
-        self._rows = table.findAll('tr')[skiprows:]
+        self._rows = table.cssselect('tr')[skiprows:]
 
         # determine the fieldnames
         if not fieldnames:
-            self._fieldnames = [td.string
-                                for td in self._rows[0].findAll(('td','th'))]
+            self._fieldnames = [td.text_content()
+                                for td in self._rows[0].cssselect('td, th')]
             skiprows += 1
         else:
             self._fieldnames = fieldnames
 
         # skip the necessary number of rows
-        self._rows = table.findAll('tr')[skiprows:]
+        self._rows = table.cssselect('tr')[skiprows:]
 
     def process_tr(self):
         for row in self._rows:
-            strings = [utils.string_dig(td) for td in row.findAll('td')]
+            strings = [cell.text_content() for cell in row.cssselect('td')]
             yield dict(zip(self._fieldnames, strings))
 
     def __iter__(self):
diff --git a/saucebrush/tests/sources.py b/saucebrush/tests/sources.py
index f3a07dc..1db434c 100644
--- a/saucebrush/tests/sources.py
+++ b/saucebrush/tests/sources.py
@@ -65,13 +65,13 @@ class SourceTestCase(unittest.TestCase):
 
         try:
 
-            from BeautifulSoup import BeautifulSoup
+            import lxml
 
             hts = HtmlTableSource(content, 'thetable')
             self.assertEqual(list(hts), [{'a': '1', 'b': '2', 'c': '3'}])
 
         except ImportError:
-            self.skipTest("BeautifulSoup is not installed")
+            self.skipTest("lxml is not installed")
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/saucebrush/utils.py b/saucebrush/utils.py
index afa67dc..e30d4ef 100644
--- a/saucebrush/utils.py
+++ b/saucebrush/utils.py
@@ -25,21 +25,6 @@ def get_django_model(dj_settings, app_label, model_name):
     from django.db.models import get_model
     return get_model(app_label, model_name)
 
-
-def string_dig(element, separator=''):
-    """
-        Dig into BeautifulSoup HTML elements looking for inner strings.
-
-        If element resembled: <p><b>test</b><em>test</em></p>
-        then string_dig(element, '~') would return test~test
-    """
-    if element.string:
-        return element.string
-    else:
-        return separator.join([string_dig(child)
-                            for child in element.findAll(True)])
-
-
 def flatten(item, prefix='', separator='_', keys=None):
     """
         Flatten nested dictionary into one with its keys concatenated together.