Skip to content

Commit

Permalink
Calculate next/prev inside the BeautifulSoup3 adaptor, from the tree …
Browse files Browse the repository at this point in the history
…traversal and offset information.
  • Loading branch information
nostrademons committed Mar 17, 2015
1 parent d4c67b7 commit 042a330
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 3 deletions.
32 changes: 29 additions & 3 deletions python/gumbo/soup_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def _add_source_info(obj, original_text, start_pos, end_pos):
obj.line = start_pos.line
obj.col = start_pos.column
obj.offset = start_pos.offset
obj.end_line = end_pos.line
obj.end_col = end_pos.column
obj.end_offset = end_pos.offset
if end_pos:
obj.end_line = end_pos.line
obj.end_col = end_pos.column
obj.end_offset = end_pos.offset


def _convert_attrs(attrs):
Expand Down Expand Up @@ -69,6 +70,7 @@ def _add_element(soup, element):
def _add_text(cls):
    """Return a handler that converts a parsed text node into a `cls` instance.

    The returned callable takes `(soup, element)`, wraps the UTF-8 form of
    `element.text` in `cls`, and records the element's source position on it.
    """
    def add_text_internal(soup, element):
        converted = cls(_utf8(element.text))
        # Text nodes carry no end position here, so None is passed for it.
        _add_source_info(
            converted, element.original_text, element.start_pos, None)
        return converted

    return add_text_internal

Expand All @@ -88,8 +90,32 @@ def _add_node(soup, node):
return _HANDLERS[node.type.value](soup, node.contents)


def _add_next_prev_pointers(soup):
def _traverse(node):
# .findAll requires the .next pointer, which is what we're trying to add
# when we call this, and so we manually supply a generator to yield the
# nodes in DOM order.
yield node
try:
for child in node.contents:
for descendant in _traverse(child):
yield descendant
except AttributeError:
# Not an element.
return

nodes = sorted(_traverse(soup), key=lambda node: node.offset)
if nodes:
nodes[0].previous = None
nodes[-1].next = None
for i, node in enumerate(nodes[1:-1], 1):
nodes[i-1].next = node
node.previous = nodes[i-1]


def parse(text, **kwargs):
    """Parse `text` with gumboc and return it as a BeautifulSoup tree.

    Any keyword arguments are forwarded unchanged to `gumboc.parse`. The
    converted root node is appended to a fresh `BeautifulSoup.BeautifulSoup`
    object, and next/previous pointers are filled in before it is returned.
    """
    with gumboc.parse(text, **kwargs) as parsed:
        document = BeautifulSoup.BeautifulSoup()
        root_contents = parsed.contents.root.contents
        document.append(_add_node(document, root_contents))
        _add_next_prev_pointers(document)
        return document
4 changes: 4 additions & 0 deletions python/gumbo/soup_adapter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def testSimpleParse(self):
self.assertEquals(head, body.previousSibling)
self.assertEquals(2, len(body)) # <ul> + trailing whitespace
self.assertEquals(u'ul', body.contents[0].name)
self.assertEquals(body, head.next)
self.assertEquals(head, body.previous)

list_items = body.findAll('li')
self.assertEquals(4, len(list_items))
Expand All @@ -53,6 +55,8 @@ def testSimpleParse(self):
a2 = body.find('a', href='two.html')
self.assertEquals(u'a', a2.name)
self.assertEquals(u'Two', a2.contents[0])
self.assertEquals(a2, evens[0].next)
self.assertEquals(evens[0], a2.previous)

li2 = a2.parent
self.assertEquals(u'li', li2.name)
Expand Down

0 comments on commit 042a330

Please sign in to comment.