The problem
I’m trying to parse some blocks of HTML to store the relevant data in a JSON object but I’m struggling with the way BeautifulSoup’s treatment of child tags clashes with my specific requirements.
Example input:
<p>Here's a paragraph</p> <ul> <li>With a list</li> <li> <ul> <li>And a nested list</li> <li>Within it that has some <strong>bold text</strong></li> </ul> </li> </ul>
Desired output:
[ { "type":"p", "content":"Here's a paragraph" }, { "type":"ul", "content":[ { "type":"li", "content":"With a list" }, { "type":"li", "content":[ { "type":"ul", "content":[ { "type":"li", "content":"And a nested list" }, { "type":"li", "content":"Within it that has some bold text" } ] } ] } ] } ]
My attempt
Here’s my best attempt so far:
from bs4 import BeautifulSoup
import json


def process(html):
    """Collect the <p>, <ul> and <ol> elements of *html* and print them as JSON.

    This is the first attempt and has known shortcomings: lists come out
    flat rather than nested, a list can be emitted more than once, and text
    inside inline child tags is dropped.

    Args:
        html: The HTML fragment to parse, as a string.

    Returns:
        None. The collected structure is printed with ``json.dumps``.
    """
    content = []
    soup = BeautifulSoup(html, 'html.parser')
    # .descendants visits every node at every depth, so a nested <ul> is
    # reached here as an element in its own right and lands in the flat
    # top-level list instead of inside its parent <li>.
    elements = soup.descendants
    for element in elements:
        if str(element).strip() not in [' ', '']:
            if element.name in ['p']:  # , 'ul', 'ol', 'li']:
                content.append({
                    'type': element.name,
                    # Only the first direct text node is used.
                    'content': element.find(text=True, recursive=False)
                })
            elif element.name in ['ul', 'ol']:
                parent = {
                    'type': element.name,
                    'content': []
                }
                for child in element.children:
                    # Skip the newline text nodes between the <li> tags.
                    if child != '\n':
                        # An <li> whose first direct text node is just a
                        # newline (i.e. it only wraps a nested list) is skipped.
                        if child.find(text=True, recursive=False) != '\n':
                            parent['content'].append({
                                'type': child.name,
                                # recursive=False stops at the first direct
                                # text node, so text inside child tags such
                                # as <strong> is not included.
                                'content': child.find(text=True, recursive=False)
                            })
                            # Runs once per accepted <li>, so a list with two
                            # accepted items is appended to the result twice.
                            content.append(parent)
    print(json.dumps(content, indent=4))


if __name__ == '__main__':
    # The markup is deliberately unindented: the checks above compare
    # whitespace nodes against exactly '\n'.
    original = '''<p>Here's a paragraph</p>
<ul>
<li>With a list</li>
<li>
<ul>
<li>And a nested list</li>
<li>Within it that has some <strong>bold text</strong></li>
</ul>
</li>
</ul>
'''
    process(original)
Which produces the following output:
[ { "type": "p", "content": "Here's a paragraph" }, { "type": "ul", "content": [ { "type": "li", "content": "With a list" } ] }, { "type": "ul", "content": [ { "type": "li", "content": "And a nested list" }, { "type": "li", "content": "Within it that has some " } ] }, { "type": "ul", "content": [ { "type": "li", "content": "And a nested list" }, { "type": "li", "content": "Within it that has some " } ] } ]
You can see I have three issues:
- The inner list appears twice
- The inner list is not nested within its parent list
- The text enclosed within the <strong> tags is lost
I know it’s a bit of a bizarre thing to do to HTML, but any suggestions on how to resolve these three points?
Advertisement
Answer
It’s not a BeautifulSoup solution, but perhaps it would be easier to use an event-based parser instead, such as lxml.etree.iterparse().
You can register for start/end (open tag/close tag) events which can be a useful way of handling the parent/child nesting.
import io
import json

import lxml.etree


def process(html):
    """Parse *html* into nested ``{'type', 'content'}`` dicts and print them as JSON.

    The document is read with ``lxml.etree.iterparse`` using 'start'/'end'
    events. A stack of open tags tracks nesting: a dict is pushed when a tag
    opens and attached to its parent when the tag closes. A tag whose only
    content is text has that text as its ``content`` instead of a list, and
    a text-only child that directly follows text in its parent (for example
    ``<strong>`` inside an ``<li>``) is merged into that text.

    Args:
        html: The HTML to parse, as a string.

    Returns:
        None. The resulting structure is printed with ``json.dumps``.
    """
    # convert html str into fileobj for iterparse
    html = io.BytesIO(html.encode('utf-8'))
    parser = lxml.etree.iterparse(
        html, events=('start', 'end'), html=True)

    root = None
    parents = []  # stack of dicts for the tags that are currently open

    for event, tag in parser:
        if event == 'start':
            content = []
            # NOTE(review): lxml documents that .text is not guaranteed to be
            # populated at the 'start' event; this works for the small sample
            # input but should be confirmed for large documents.
            if tag.text and tag.text.strip():
                content.append(tag.text.strip())
            child = dict(type=tag.tag, content=content)
            parents.append(child)
            if root is None:
                root = child
        else:
            # close </tag> - take it off the stack of open tags
            child = parents.pop()
            content = child['content']

            # unwrap 1 element lists that contain a text only node
            if len(content) == 1 and isinstance(content[0], str):
                child['content'] = content[0]

            if not parents:
                # The root element has no parent to attach to.
                continue

            siblings = parents[-1]['content']
            # If the previous element is also a text only node, join the
            # text together and "discard" the "dict". Only text-only children
            # are merged: a child with nested or empty content is attached as
            # a dict, since a str cannot be concatenated with a list.
            if (isinstance(child['content'], str)
                    and siblings and isinstance(siblings[-1], str)):
                siblings[-1] += ' ' + child['content']
            else:
                siblings.append(child)

    # NOTE: tag.tail is never read, so text that follows a closing child tag
    # (e.g. " more" in "<b>x</b> more") is not part of the output.

    # Uncomment to drop the <html><body> wrapper that the HTML parser adds:
    #root = root['content'][0]['content']
    print(json.dumps(root, indent=4))
iterparse (with html=True) adds <html><body> wrapper tags – you can use root = root['content'][0]['content'] (the commented-out line above) or similar if you want to exclude them.
output:
{ "type": "html", "content": [ { "type": "body", "content": [ { "type": "p", "content": "Here's a paragraph" }, { "type": "ul", "content": [ { "type": "li", "content": "With a list" }, { "type": "li", "content": [ { "type": "ul", "content": [ { "type": "li", "content": "And a nested list" }, { "type": "li", "content": "Within it that has some bold text" } ] } ] } ] } ] } ] }