Skip to content
Advertisement

Extract HTML into JSON with Python BeautifulSoup

The problem

I’m trying to parse some blocks of HTML to store the relevant data in a JSON object but I’m struggling with the way BeautifulSoup’s treatment of child tags clashes with my specific requirements.

Example input:

<p>Here's a paragraph</p>
<ul>
    <li>With a list</li>
    <li>
        <ul>
            <li>And a nested list</li>
            <li>Within it that has some <strong>bold text</strong></li>
        </ul>
    </li>
</ul>

Desired output:

[
    {
        "type":"p",
        "content":"Here's a paragraph"
    },
    {
        "type":"ul",
        "content":[
            {
                "type":"li",
                "content":"With a list"
            },
            {
                "type":"li",
                "content":[
                    {
                        "type":"ul",
                        "content":[
                            {
                                "type":"li",
                                "content":"And a nested list"
                            },
                            {
                                "type":"li",
                                "content":"Within it that has some bold text"
                            }
                        ]
                    }
                ]
            }
        ]
    }
]

My attempt

Here’s my best attempt so far:

from bs4 import BeautifulSoup
import json

def process(html):
    """Parse *html* with BeautifulSoup and print a JSON list of ``{type, content}`` dicts.

    This is the question's first attempt and is kept as written. Visible
    limitations of the code below:

    * every ``<ul>``/``<ol>`` reached through ``soup.descendants`` is appended
      to the single top-level ``content`` list, so nested lists are never
      placed inside their parent;
    * the list dict is appended to ``content`` inside the per-child branch,
      i.e. once for every child that is accepted, not once per list;
    * only a directly contained text node is looked up
      (``find(text=True, recursive=False)``), so text inside inline child tags
      is not included.

    Nothing is returned; the result is printed with ``json.dumps``.
    """
    content = []
    soup = BeautifulSoup(html, 'html.parser')
    # Flat walk over every node in the document, at any depth.
    elements = soup.descendants
    for element in elements:
        # Ignore nodes whose string form is empty once stripped
        # (the ' ' entry can never match after .strip()).
        if str(element).strip() not in [' ', '']:
            if element.name in ['p']:#, 'ul', 'ol', 'li']:
                # Paragraphs become a leaf entry holding their direct text.
                content.append({
                    'type':element.name,
                    'content':element.find(text=True, recursive=False)
                })
            elif element.name in ['ul', 'ol']:
                # One dict per list; its items are collected from the direct children.
                parent = {
                    'type':element.name,
                    'content':[]
                }
                for child in element.children:
                    # NOTE(review): 'n' here and on the next line is presumably a
                    # garbled '\n' (backslash lost in extraction) meant to skip
                    # newline-only nodes -- confirm against the original post.
                    if child != 'n':
                        if child.find(text=True, recursive=False) != 'n':
                            parent['content'].append({
                                'type':child.name,
                                'content':child.find(text=True, recursive=False)
                            })
                            # Appended here, inside the per-child branch, so the
                            # same list dict is added once per accepted child.
                            content.append(parent)
    print(json.dumps(content, indent=4))

if __name__ == '__main__':
    # Demo run: feed the sample document (a paragraph followed by a list that
    # contains a nested list with inline bold text) straight to process().
    process('''<p>Here's a paragraph</p>
<ul>
    <li>With a list</li>
    <li>
        <ul>
            <li>And a nested list</li>
            <li>Within it that has some <strong>bold text</strong></li>
        </ul>
    </li>
</ul>
''')

Which produces the following output:

[
    {
        "type": "p",
        "content": "Here's a paragraph"
    },
    {
        "type": "ul",
        "content": [
            {
                "type": "li",
                "content": "With a list"
            }
        ]
    },
    {
        "type": "ul",
        "content": [
            {
                "type": "li",
                "content": "And a nested list"
            },
            {
                "type": "li",
                "content": "Within it that has some "
            }
        ]
    },
    {
        "type": "ul",
        "content": [
            {
                "type": "li",
                "content": "And a nested list"
            },
            {
                "type": "li",
                "content": "Within it that has some "
            }
        ]
    }
]

You can see I have three issues:

  1. The inner list appears twice
  2. The inner list is not nested within its parent list
  3. The text enclosed within the <strong> tags is lost

I know it’s a bit of a bizarre thing to do to HTML, but any suggestions on how to resolve these three points?

Advertisement

Answer

It’s not a BeautifulSoup solution – but perhaps it would be easier to use an event-based parser instead, such as lxml.etree.iterparse().

You can register for start/end (open tag/close tag) events which can be a useful way of handling the parent/child nesting.

import io, json, lxml.etree

def process(html):
    """Convert an HTML string into a nested ``{type, content}`` tree and print it as JSON.

    The document is streamed through ``lxml.etree.iterparse`` with ``start``
    and ``end`` events. A stack of currently open elements tracks nesting:
    each ``start`` pushes a new node, each ``end`` pops it and attaches it to
    the node beneath it on the stack.

    An element whose content is a single text string is unwrapped from a
    one-item list to a plain string. If such an element directly follows a
    text string in its parent (e.g. ``<strong>`` inside an ``<li>``), its text
    is joined onto that string with a space and the element itself is dropped.

    Args:
        html: The HTML markup to convert, as ``str``.

    Returns:
        None. The resulting tree is printed with ``json.dumps(..., indent=4)``.
    """
    # iterparse needs a file-like object, so wrap the encoded markup.
    html = io.BytesIO(html.encode('utf-8'))

    parser = lxml.etree.iterparse(
        html, events=('start', 'end'), html=True)

    root = None
    parents = []  # stack of open elements, outermost first

    for event, tag in parser:
        if event == 'start':
            content = []
            # NOTE(review): lxml documents that .text is not guaranteed to be
            # populated yet at the 'start' event -- confirm this is reliable
            # for the inputs you care about.
            if tag.text and tag.text.strip():
                content.append(tag.text.strip())
            child = dict(type=tag.tag, content=content)
            parents.append(child)
            if root is None:
                root = child
            continue

        # 'end' event: the element on top of the stack is now complete.
        child = parents.pop()
        # The outermost element has no parent; bind None explicitly so the
        # merge step below never reads a stale or undefined name.
        parent = parents[-1] if parents else None
        if parent is not None:
            parent['content'].append(child)

        content = child['content']
        # Unwrap one-element lists that hold only a text node.
        if len(content) == 1 and isinstance(content[0], str):
            child['content'] = content.pop()
            # If the previous sibling entry is also plain text, join the two
            # strings together and discard the dict that was just appended.
            if (parent is not None
                    and len(parent['content']) > 1
                    and isinstance(parent['content'][-2], str)):
                parent['content'][-2] += ' ' + child['content']
                parent['content'].pop()

    # iterparse wraps the input in <html><body>; to exclude those wrappers use:
    #root = root['content'][0]['content']
    print(json.dumps(root, indent=4))

iterparse adds <html><body> tags – if you want to exclude them, you can use root = root['content'][0]['content'] (or similar) before printing.

output:

{
    "type": "html",
    "content": [
        {
            "type": "body",
            "content": [
                {
                    "type": "p",
                    "content": "Here's a paragraph"
                },
                {
                    "type": "ul",
                    "content": [
                        {
                            "type": "li",
                            "content": "With a list"
                        },
                        {
                            "type": "li",
                            "content": [
                                {
                                    "type": "ul",
                                    "content": [
                                        {
                                            "type": "li",
                                            "content": "And a nested list"
                                        },
                                        {
                                            "type": "li",
                                            "content": "Within it that has some bold text"
                                        }
                                    ]
                                }
                            ]
                        }
                    ]
                }
            ]
        }
    ]
}
User contributions licensed under: CC BY-SA
10 People found this is helpful
Advertisement