The problem
I’m trying to parse some blocks of HTML to store the relevant data in a JSON object but I’m struggling with the way BeautifulSoup’s treatment of child tags clashes with my specific requirements.
Example input:
<p>Here's a paragraph</p>
<ul>
<li>With a list</li>
<li>
<ul>
<li>And a nested list</li>
<li>Within it that has some <strong>bold text</strong></li>
</ul>
</li>
</ul>
Desired output:
[
{
"type":"p",
"content":"Here's a paragraph"
},
{
"type":"ul",
"content":[
{
"type":"li",
"content":"With a list"
},
{
"type":"li",
"content":[
{
"type":"ul",
"content":[
{
"type":"li",
"content":"And a nested list"
},
{
"type":"li",
"content":"Within it that has some bold text"
}
]
}
]
}
]
}
]
My attempt
Here’s my best attempt so far:
from bs4 import BeautifulSoup
import json
def process(html):
    """Print a JSON summary of the ``<p>``, ``<ul>`` and ``<ol>`` tags in *html*.

    Every descendant of the parsed document is visited in document order.
    A ``<p>`` becomes ``{"type", "content"}`` with its first direct text
    node; a ``<ul>``/``<ol>`` becomes ``{"type", "content": [...]}`` with one
    entry per child whose first direct text node is not a bare newline.

    Known limitations of this attempt (kept as-is, since they are the
    subject of the question):

    * nested lists are emitted as separate top-level entries instead of
      inside their parent item;
    * a list is appended once per accepted item, so it can appear more
      than once;
    * only the first direct text node of an item is used, so text inside
      inline child tags is dropped.

    Args:
        html: HTML fragment as a string.

    Returns:
        None. The result is printed as indented JSON.
    """
    content = []
    soup = BeautifulSoup(html, 'html.parser')
    for element in soup.descendants:
        # Skip whitespace-only nodes (the newlines between tags).
        if not str(element).strip():
            continue
        if element.name == 'p':
            content.append({
                'type': element.name,
                'content': element.find(text=True, recursive=False),
            })
        elif element.name in ('ul', 'ol'):
            parent = {'type': element.name, 'content': []}
            for child in element.children:
                # Newline text nodes sit between the <li> tags.
                if child == '\n':
                    continue
                text = child.find(text=True, recursive=False)
                # An item that starts with a nested list has '\n' as its
                # first direct text node; such items are skipped here.
                if text == '\n':
                    continue
                parent['content'].append({
                    'type': child.name,
                    'content': text,
                })
                # NOTE: runs once per accepted child, so the same list
                # object is added to the result repeatedly.
                content.append(parent)
    print(json.dumps(content, indent=4))
if __name__ == '__main__':
    # Sample document: a paragraph followed by a list whose second item
    # holds a nested list with some inline bold text.
    sample_html = '''<p>Here's a paragraph</p>
<ul>
<li>With a list</li>
<li>
<ul>
<li>And a nested list</li>
<li>Within it that has some <strong>bold text</strong></li>
</ul>
</li>
</ul>
'''
    process(sample_html)
Which produces the following output:
[
{
"type": "p",
"content": "Here's a paragraph"
},
{
"type": "ul",
"content": [
{
"type": "li",
"content": "With a list"
}
]
},
{
"type": "ul",
"content": [
{
"type": "li",
"content": "And a nested list"
},
{
"type": "li",
"content": "Within it that has some "
}
]
},
{
"type": "ul",
"content": [
{
"type": "li",
"content": "And a nested list"
},
{
"type": "li",
"content": "Within it that has some "
}
]
}
]
You can see I have three issues:
- The inner list appears twice
- The inner list is not nested within its parent list
- The text enclosed within the `<strong>` tags is lost
I know it’s a bit of a bizarre thing to do to HTML, but any suggestions on how to resolve these three points?
Advertisement
Answer
It’s not a beautifulsoup solution – but perhaps it would be easier to use an event-based parser instead such as lxml.etree.iterparse()
You can register for start/end (open tag/close tag) events which can be a useful way of handling the parent/child nesting.
import io, json, lxml.etree
def process(html):
    """Print *html* as a nested JSON tree of ``{"type", "content"}`` nodes.

    The markup is streamed through ``lxml.etree.iterparse`` with
    ``start``/``end`` events. A stack of open nodes tracks nesting: a node
    is pushed when its tag opens and attached to the node below it on the
    stack when its tag closes.

    Post-processing applied when a tag closes:

    * a node whose content is a single string is unwrapped from
      ``["text"]`` to ``"text"``;
    * if such a text-only node directly follows a string in its parent's
      content (e.g. ``some <strong>bold</strong>``), its text is joined
      onto that string with a space and the node itself is dropped.

    The HTML parser wraps the input in ``<html><body>``; use
    ``root['content'][0]['content']`` to get at the original top-level
    elements.

    NOTE(review): only ``tag.text`` is read; text that follows a child's
    closing tag (``tag.tail``) is not collected.

    Args:
        html: HTML fragment as a string.

    Returns:
        None. The tree is printed as indented JSON.
    """
    # iterparse reads from a binary file object, so wrap the encoded string.
    source = io.BytesIO(html.encode('utf-8'))
    parser = lxml.etree.iterparse(
        source, events=('start', 'end'), html=True)

    root = None
    parents = []  # stack of currently open nodes, innermost last
    for event, tag in parser:
        if event == 'start':
            content = []
            if tag.text and tag.text.strip():
                content.append(tag.text.strip())
            child = dict(type=tag.tag, content=content)
            parents.append(child)
            if root is None:
                root = child
            continue

        # Closing tag: take the node off the stack and attach it to the
        # node that is now on top, if any (the outermost node has none).
        child = parents.pop()
        parent = parents[-1] if parents else None
        if parent is not None:
            parent['content'].append(child)

        content = child['content']
        # Unwrap one-element lists that contain only a text node.
        if len(content) == 1 and isinstance(content[0], str):
            child['content'] = content.pop()
            # If the previous sibling is plain text, join the two strings
            # and discard the dict that was just attached.
            if (parent is not None
                    and len(parent['content']) > 1
                    and isinstance(parent['content'][-2], str)):
                parent['content'][-2] += ' ' + child['content']
                parent['content'].pop()

    print(json.dumps(root, indent=4))
`iterparse` adds `<html>` and `<body>` tags – you can use `root = root['content'][0]['content']` or so if you want to exclude them.
output:
{
"type": "html",
"content": [
{
"type": "body",
"content": [
{
"type": "p",
"content": "Here's a paragraph"
},
{
"type": "ul",
"content": [
{
"type": "li",
"content": "With a list"
},
{
"type": "li",
"content": [
{
"type": "ul",
"content": [
{
"type": "li",
"content": "And a nested list"
},
{
"type": "li",
"content": "Within it that has some bold text"
}
]
}
]
}
]
}
]
}
]
}