diff --git a/saucebrush/sources.py b/saucebrush/sources.py index c0d925b..a3d272b 100644 --- a/saucebrush/sources.py +++ b/saucebrush/sources.py @@ -4,8 +4,9 @@ All sources must implement the iterable interface and return python dictionaries. """ - +from __future__ import unicode_literals import string + from saucebrush import utils class CSVSource(object): @@ -95,7 +96,7 @@ class HtmlTableSource(object): soup = BeautifulSoup(htmlfile.read()) if isinstance(id_or_num, int): table = soup.findAll('table')[id_or_num] - elif isinstance(id_or_num, str): + else: table = soup.find('table', id=id_or_num) # skip the necessary number of rows @@ -105,9 +106,13 @@ class HtmlTableSource(object): if not fieldnames: self._fieldnames = [td.string for td in self._rows[0].findAll(('td','th'))] + skiprows += 1 else: self._fieldnames = fieldnames + # skip the necessary number of rows + self._rows = table.findAll('tr')[skiprows:] + def process_tr(self): for row in self._rows: strings = [utils.string_dig(td) for td in row.findAll('td')] diff --git a/saucebrush/tests/sources.py b/saucebrush/tests/sources.py index 82c7086..f3a07dc 100644 --- a/saucebrush/tests/sources.py +++ b/saucebrush/tests/sources.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from io import BytesIO, StringIO import unittest -from saucebrush.sources import CSVSource, FixedWidthFileSource +from saucebrush.sources import CSVSource, FixedWidthFileSource, HtmlTableSource class SourceTestCase(unittest.TestCase): @@ -44,5 +44,34 @@ class SourceTestCase(unittest.TestCase): 'year':'1999'}] self.assertEqual(list(source), expected_data) + def test_html_table_source(self): + + content = StringIO(""" + +
a | +b | +c | +
---|---|---|
1 | +2 | +3 | +