forked from google/gumbo-parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
soup_adapter.py
121 lines (95 loc) · 3.49 KB
/
soup_adapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Adapter between Gumbo and BeautifulSoup.
This parses an HTML document and gives back a BeautifulSoup object, which you
can then manipulate like a normal BeautifulSoup parse tree.
"""
__author__ = 'jdtang@google.com (Jonathan Tang)'
import BeautifulSoup
import gumboc
def _utf8(text):
    """Decode a UTF-8 encoded byte string into a unicode string.

    Malformed byte sequences do not raise; the 'replace' error handler
    substitutes the Unicode replacement character for them.
    """
    decoded = text.decode('utf-8', 'replace')
    return decoded
def _add_source_info(obj, original_text, start_pos, end_pos):
    """Annotate `obj` with the source text and position it was parsed from.

    Sets `obj.original` to `str(original_text)` and copies the line, column
    and offset of `start_pos` onto `obj.line`, `obj.col` and `obj.offset`.
    When `end_pos` is truthy, the same three fields are copied onto
    `obj.end_line`, `obj.end_col` and `obj.end_offset`; otherwise no end_*
    attributes are created.
    """
    # (attribute name set on obj, field name read from the position object)
    position_fields = (('line', 'line'), ('col', 'column'), ('offset', 'offset'))

    obj.original = str(original_text)
    for target, field in position_fields:
        setattr(obj, target, getattr(start_pos, field))

    if not end_pos:
        return
    for target, field in position_fields:
        setattr(obj, 'end_' + target, getattr(end_pos, field))
def _convert_attrs(attrs):
    """Return a list of (name, value) pairs, one per attribute in `attrs`.

    Both the name and the value of every attribute are decoded with _utf8.
    The order of `attrs` is preserved.
    """
    # TODO(jdtang): Ideally attributes would pass along their positions as
    # well, but I can't extend the built in str objects with new attributes.
    # Maybe work around this with a subclass in some way...
    pairs = []
    for attr in attrs:
        pairs.append((_utf8(attr.name), _utf8(attr.value)))
    return pairs
def _add_document(soup, element):
    """Handler for document nodes; deliberately does nothing.

    Document nodes are currently ignored, since there's no real place for
    them in the BeautifulSoup API. Both arguments are accepted only so the
    signature matches the other node handlers.
    """
    return None
def _add_element(soup, element):
    """Build a BeautifulSoup.Tag from a gumbo element and its children.

    The tag name and attributes are decoded via _utf8 / _convert_attrs, each
    child is converted with _add_node and appended in order, and the
    resulting tag is annotated with its source text and positions. The
    original end-tag text is stored on `tag.original_end_tag`.

    Returns the new Tag.
    """
    # TODO(jdtang): Expose next/previous in gumbo so they can be passed along
    # to BeautifulSoup.
    name = _utf8(element.tag_name)
    attributes = _convert_attrs(element.attributes)
    tag = BeautifulSoup.Tag(soup, name, attributes)

    for child in element.children:
        converted_child = _add_node(soup, child)
        tag.append(converted_child)

    _add_source_info(
        tag, element.original_tag, element.start_pos, element.end_pos)
    tag.original_end_tag = str(element.original_end_tag)
    return tag
def _add_text(cls):
    """Return a node handler that wraps a text-like gumbo node in `cls`.

    The returned function takes (soup, element), constructs `cls` from the
    decoded `element.text`, and annotates the result with the element's
    original text and start position. No end position is recorded.
    """
    def add_text_internal(soup, element):
        decoded = _utf8(element.text)
        node = cls(decoded)
        # Text nodes carry only a start position, hence end_pos=None.
        _add_source_info(node, element.original_text, element.start_pos, None)
        return node

    return add_text_internal
# Dispatch table used by _add_node: the handler stored at index i converts a
# node whose `type.value` equals i, so the order of the entries is significant.
# Indices 2 and 5 both produce NavigableString objects, and indices 1 and 6
# both go through _add_element.
# NOTE(review): the per-index labels below assume gumboc numbers its node types
# document, element, text, CDATA, comment, whitespace, template -- confirm
# against the gumboc node-type enum.
_HANDLERS = [
    # 0: presumably document nodes (ignored).
    _add_document,
    # 1: presumably element nodes.
    _add_element,
    # 2: presumably text nodes.
    _add_text(BeautifulSoup.NavigableString),
    # 3: presumably CDATA sections.
    _add_text(BeautifulSoup.CData),
    # 4: presumably comments.
    _add_text(BeautifulSoup.Comment),
    # 5: presumably whitespace-only text; handled like ordinary text.
    _add_text(BeautifulSoup.NavigableString),
    # 6: presumably template elements; handled like ordinary elements.
    _add_element,
]
def _add_node(soup, node):
    """Convert one gumbo node by dispatching on its type.

    The handler is looked up in _HANDLERS by `node.type.value` and is called
    with `soup` and `node.contents`; whatever the handler returns is returned
    unchanged.
    """
    handler = _HANDLERS[node.type.value]
    return handler(soup, node.contents)
def _add_next_prev_pointers(soup):
    """Wire up the `.next` / `.previous` pointers of every node under `soup`.

    All nodes reachable from `soup` through `.contents` (including `soup`
    itself) are collected and sorted by their `.offset` attribute. Each node
    is then linked to its neighbours in that order: the first node gets
    `previous = None`, the last node gets `next = None`, and every adjacent
    pair is linked in both directions.

    Every reachable node must therefore expose an `offset` usable as a sort
    key. The function mutates the nodes in place and returns None.
    """
    def _traverse(node):
        # .findAll requires the .next pointer, which is what we're trying to
        # add when we call this, and so we manually supply a generator to
        # yield the nodes in DOM order.
        yield node
        try:
            children = node.contents
        except AttributeError:
            # Not an element.
            return
        for child in children:
            for descendant in _traverse(child):
                yield descendant

    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
    if not nodes:
        return

    nodes[0].previous = None
    nodes[-1].next = None
    # Link every adjacent pair, including the final one. Iterating over
    # nodes[1:-1] would leave the last node's .previous and its predecessor's
    # .next unset.
    for before, after in zip(nodes, nodes[1:]):
        before.next = after
        after.previous = before
def parse(text, **kwargs):
    """Parse `text` with gumbo and return it as a BeautifulSoup object.

    Any keyword arguments are forwarded unchanged to gumboc.parse. The root
    node of the gumbo output is converted into BeautifulSoup nodes and
    appended to a fresh BeautifulSoup object, then next/previous pointers
    are filled in. The whole conversion runs while the gumbo output is
    still open.
    """
    with gumboc.parse(text, **kwargs) as output:
        document = BeautifulSoup.BeautifulSoup()
        gumbo_root = output.contents.root.contents
        converted_root = _add_node(document, gumbo_root)
        document.append(converted_root)
        _add_next_prev_pointers(document)
        return document