I have a repetitive sanity-check process I go through with most calls to a BeautifulSoup object where I:
- Make the function call (`.find`, `.find_all`, `.select_one`, and `.select` mostly)
- Check to make sure the element(s) were found
  - If not found, I raise a custom `MissingHTMLTagError`, stopping the process there.
- Attempt to retrieve attribute(s) from the element(s) (using `.get` or `getattr`)
  - If not found, I raise a custom `MissingHTMLAttributeError`
- Return either a:
  - string, when it’s a single attribute of a single element (`.find` and `.select_one`)
  - list of strings, when it’s a single attribute of multiple elements (`.find_all` and `.select`)
  - dict, when it’s two attributes (key/value pairs) for multiple elements (`.find_all` and `.select`)
I’ve created the below solution that acts as a proxy (not-so-elegantly) to BeautifulSoup methods. But I’m hoping there is an easier way to accomplish this. Basically, I want to be able to patch all the BeautifulSoup methods to:
- Allow for an extra parameter to be passed, so that the above steps are taken care of in a single call
- If any of the above methods is used without the extra parameter, return the BeautifulSoup objects like normal, or raise the `MissingHTMLTagError` if the return value is None or an empty list.
Most of the time the below function is used with a class variable (`self._soup`), which is just a BeautifulSoup object of the most-recent `requests.Response`.
from bs4 import BeautifulSoup
import requests


def get_html_value(self, element, attribute=None, soup=None, func="find", **kwargs):
    """Look up element(s) in a soup and optionally extract attribute values.

    A proxy around the BeautifulSoup lookup methods that finds the
    element(s), validates that they exist, optionally pulls attribute
    value(s) off them, and raises ``MissingHtmlError`` as soon as anything
    is missing.

    Args:
        element: First positional argument handed to the BeautifulSoup
            method named by ``func`` (a tag name for ``find``/``find_all``,
            a CSS selector for ``select``/``select_one``).
        attribute: ``None`` (or an empty string) to return the element(s)
            themselves, a ``str`` to extract that single attribute, or a
            2-tuple ``(key_attr, value_attr)`` to build a ``dict`` mapping
            each element's ``key_attr`` value to its ``value_attr`` value.
        soup: Optional source to search instead of ``self._soup``. A ``str``
            or a ``requests.Response`` is parsed with ``"html.parser"``;
            anything else is used as given.
        func: Name of the method to call on the soup.
        **kwargs: Extra keyword arguments forwarded to that method.

    Returns:
        * the element(s) unchanged when no attribute is requested;
        * a ``str`` (single element) or ``list`` (multiple elements) when
          ``attribute`` is a string;
        * a ``dict`` when ``attribute`` is a 2-tuple; a missing value
          attribute is stored as ``""``.

    Raises:
        ValueError: No soup is available, or ``attribute`` is a tuple whose
            length is not 2.
        TypeError: ``attribute`` is not ``None``, a ``str`` or a ``tuple``.
        AttributeError: ``func`` does not resolve to something callable.
        MissingHtmlError: The lookup returned nothing, or a requested
            attribute is missing on a matched element.

    Examples::

        # Single attribute from a single element (BeautifulSoup.find)
        self.get_html_value("a", "href", attrs={"class": "report-list"})
        # -> "example.com/page"

        # Single attribute from multiple elements (BeautifulSoup.find_all)
        self.get_html_value("a", "href", func="find_all",
                            attrs={"class": "top-nav-link"})
        # -> ["example.com/category1", "example.com/category2", ...]

        # Key/value pairs (e.g. hidden inputs for a POST request) taken
        # from a fragment of the page rather than the whole document
        self.get_html_value("input", ("name", "value"), soup=login_form,
                            func="find_all", attrs={"type": "hidden"})
        # -> {"csrf_token": "a1b23c456def", "viewstate": "wxyzqwerty"}

        # No attribute: the BeautifulSoup object(s) are returned
        self.get_html_value("div#account-details > section > h1",
                            func="select_one").string
        # -> "$12,345.67"
        self.get_html_value("div#accounts > div a", func="select")
        # -> [<a href="...">Act. 1</a>, <a href="...">Act. 2</a>, ...]

        # func="select" with an attribute returns a list of values
        self.get_html_value("div#accounts > div a", attribute="href",
                            func="select")
        # -> ["example.com/account1", "example.com/account2", ...]
    """
    # NOTE(review): MissingHtmlError is not defined in this snippet; it is
    # assumed to be provided by the enclosing module - confirm.

    # Compare against None instead of relying on truthiness: a
    # requests.Response is falsy for 4xx/5xx status codes, so a truthiness
    # test would silently ignore an explicitly supplied error response.
    if soup is None:
        soup = self._soup
    if soup is None:
        raise ValueError("Class property soup not set and soup parameter not provided")

    # Provide parsing for raw strings and requests.Response objects.
    if isinstance(soup, str):
        soup = BeautifulSoup(soup, "html.parser")
    elif isinstance(soup, requests.Response):
        soup = BeautifulSoup(soup.text, "html.parser")

    # attribute=None is the documented "return the element(s)" mode, so it
    # has to pass validation.
    if attribute is not None and not isinstance(attribute, (str, tuple)):
        raise TypeError("attribute can only be a string or a tuple")
    if isinstance(attribute, tuple) and len(attribute) != 2:
        raise ValueError("attribute can only be a string or tuple of 2 strings (key/value pairing)")

    bs_func = getattr(soup, func, None)
    if not callable(bs_func):
        raise AttributeError(f"Method {func} not found in the BeautifulSoup package")

    bs_element = bs_func(element, **kwargs)
    if not bs_element:
        raise MissingHtmlError(self, element, None, soup, func, kwargs)

    # No attribute was requested, so return the element(s) as found.
    if not attribute:
        return bs_element

    # find_all/select return a list; find/select_one return one element.
    # Normalise to a list so both shapes share one extraction loop.
    is_many = isinstance(bs_element, list)
    matched = bs_element if is_many else [bs_element]

    if isinstance(attribute, str):
        values = []
        for el in matched:
            el_attribute = el.get(attribute)
            if not el_attribute:
                raise MissingHtmlError(self, element, attribute, soup, func, kwargs)
            values.append(el_attribute)
        return values if is_many else values[0]

    # Key/value pairing: the key attribute is mandatory on every element,
    # the value attribute falls back to an empty string.
    key, value = attribute
    pairs = {}
    for el in matched:
        el_key = el.get(key)
        if el_key is None:
            raise MissingHtmlError(self, element, attribute, soup, func, kwargs)
        pairs[el_key] = el.get(value, "")
    return pairs
Is there any way to wrap all of the exposed `.find` and `.select`-type methods of BeautifulSoup, so I can still use the methods normally (ex: `soup.find()`) instead of having to use my workaround function?
Advertisement
Answer
I believe I’ve figured out a succinct and reasonable way to accomplish what I’m looking for with the following wrapper:
from bs4 import BeautifulSoup
from functools import wraps
import requests
import inspect
import abc


class HTMLParseError(Exception):
    """Error type that ``MyClass.get_soup`` suppresses while parsing a response."""


class MissingHTMLTagError(Exception):
    """Raised by a wrapped soup method when the lookup returns nothing."""


class MissingHTMLAttributeError(Exception):
    """Raised by a wrapped soup method when a requested attribute is missing."""


def _attribute_or_property(el, name):
    """Return ``el``'s HTML attribute ``name``, else its Python attribute.

    Covers element attributes (``href``) as well as soup properties such as
    ``.string``. Raises ``MissingHTMLAttributeError`` when both are falsy.
    """
    value = el.get(name) or getattr(el, name, None)
    if not value:
        raise MissingHTMLAttributeError()
    return value


def _key_value_pairs(elements, key, value):
    """Map each element's ``key`` attribute to its ``value`` attribute.

    The key attribute is mandatory (``MissingHTMLAttributeError`` if it is
    absent); a missing value attribute is stored as ``""``.
    """
    pairs = {}
    for el in elements:
        el_key = el.get(key)
        if el_key is None:
            raise MissingHTMLAttributeError()
        pairs[el_key] = el.get(value, "")
    return pairs


class MyClass(metaclass=abc.ABCMeta):
    """Holds a ``requests`` session whose latest response is parsed into a wrapped soup."""

    def __init__(self):
        self._sess = requests.Session()
        # Every response received through the session goes through the hook.
        self._sess.hooks["response"].append(self._session_hook)
        self._resp = None
        self._soup = None

    def _session_hook(self, response, *args, **kwargs):
        """Implicitly sets private instance variables for seamless
        state-tracking and less boilerplate code"""
        self._resp = response
        # NOTE: parsing could be limited to responses whose content-type
        # header contains "html"; currently every response is parsed.
        self.get_soup()

    def _wrapped_soup(self, soup):
        """Patch ``soup``'s ``find*``/``select*`` methods in place and return it.

        Each patched method accepts an extra keyword argument ``extract``:

        * ``None`` (default): return the element(s) as usual.
        * ``str``: return that attribute/property of the element (a list of
          them when the method returned a list).
        * 2-tuple ``(key, value)``: return a dict built from those two
          attributes of every matched element.

        Regardless of ``extract``, ``MissingHTMLTagError`` is raised when the
        underlying method returns ``None`` or an empty list, and
        ``MissingHTMLAttributeError`` when a requested attribute is missing.
        ``TypeError`` is raised for an ``extract`` of any other type or a
        tuple that does not hold exactly two items.
        """

        def soup_wrapper(fn):
            @wraps(fn)
            def wrapped_soup(*args, **kwargs):
                # `extract` is ours; it must not reach the BeautifulSoup call.
                extract = kwargs.pop("extract", None)
                if not isinstance(extract, (str, tuple, type(None))):
                    raise TypeError("extract can only be of type None, str, or tuple")
                if isinstance(extract, tuple) and len(extract) != 2:
                    raise TypeError("extract tuple can only contain two values; key/value pair")

                elements = fn(*args, **kwargs)
                if not elements:
                    raise MissingHTMLTagError()
                if not extract:
                    return elements

                # `soup.find_all`/`soup.select` return a list; `soup.find`
                # and `soup.select_one` return a single element.
                is_many = isinstance(elements, list)

                if isinstance(extract, str):
                    if is_many:
                        return [_attribute_or_property(el, extract) for el in elements]
                    # The single element itself is the lookup target here;
                    # there is no loop variable in this branch.
                    return _attribute_or_property(elements, extract)

                key, value = extract
                return _key_value_pairs(elements if is_many else [elements], key, value)

            return wrapped_soup

        # Wrap all methods that start with find or select.
        applicable_funcs = [
            name
            for name in dir(soup)
            if name.startswith(("find", "select")) and callable(getattr(soup, name))
        ]
        for func in applicable_funcs:
            setattr(soup, func, soup_wrapper(getattr(soup, func)))
        return soup

    def get_soup(self):
        """Parse ``self._resp.text`` and store the wrapped soup in ``self._soup``."""
        try:
            self._soup = self._wrapped_soup(BeautifulSoup(self._resp.text, "html.parser"))
        except HTMLParseError:
            # Since this is implicit we need to fail gracefully.
            # NOTE: a warning could be logged here once the class has a logger.
            pass


cls = MyClass()
cls._sess.get("https://www.example.com")
test = cls._soup.find("a", extract="href")
print("test:", test)