How can I wrap all BeautifulSoup existing find/select methods in order to add additional logic and parameters?

Question

I have a repetitive sanity-check process I go through with most calls to a BeautifulSoup object, where I:

1. Make the function call (mostly .find, .find_all, .select_one, and .select).
2. Check to make sure the element(s) were found. If not found, I raise a custom MissingHTMLTagError, stopping the process there.
3. Attempt to retrieve attribute(s) from the element(s) (using .get or getattr).
4. If […]

Accepted Answer

# I believe I've figured out a succinct and reasonable way to accomplish what
# I'm looking for with the following wrapper:

import abc
import inspect
from functools import wraps

import requests
from bs4 import BeautifulSoup


class HTMLParseError(Exception):
    """Error caught (and ignored) by ``MyClass.get_soup`` while parsing."""


class MissingHTMLTagError(Exception):
    """Raised when a wrapped find/select call returns no element(s)."""


class MissingHTMLAttributeError(Exception):
    """Raised when a requested attribute is missing or empty on an element."""


def _extract_attribute(element, name):
    """Return ``element``'s value for ``name`` or raise if it is missing.

    The HTML attribute (``element.get(name)``) is tried first; when that is
    falsy the Python attribute of the same name is used instead, which covers
    soup properties such as ``.string``.

    Raises:
        MissingHTMLAttributeError: if neither lookup yields a truthy value.
    """
    # The ``None`` default keeps a missing Python attribute from surfacing as
    # a bare AttributeError instead of the domain-specific error below.
    value = element.get(name) or getattr(element, name, None)
    if not value:
        raise MissingHTMLAttributeError()
    return value


def _extract_pair(element, key, value):
    """Return ``(element[key], element[value])`` for a key/value extraction.

    The value attribute is optional and defaults to ``""``.

    Raises:
        MissingHTMLAttributeError: if the ``key`` attribute is absent.
    """
    found_key = element.get(key)
    if found_key is None:
        raise MissingHTMLAttributeError()
    return found_key, element.get(value, "")


class MyClass(metaclass=abc.ABCMeta):
    """Requests session holder that re-parses every response into a wrapped soup.

    A response hook stores the latest response in ``_resp`` and rebuilds
    ``_soup``; the soup's ``find*``/``select*`` methods accept an additional
    ``extract`` keyword (see ``_wrapped_soup``).
    """

    def __init__(self):
        self._sess = requests.Session()
        self._sess.hooks["response"].append(self._session_hook)
        self._resp = None
        self._soup = None

    def _session_hook(self, response, *args, **kwargs):
        """Implicitly sets private instance variables for seamless state-tracking and less boilerplate code"""
        self._resp = response
        # NOTE: the original author left a (disabled) guard here that would
        # only parse when "html" appears in the content-type header.
        self.get_soup()

    def _wrapped_soup(self, soup):
        """Wrap every ``find*``/``select*`` method of ``soup`` and return it.

        Each wrapped method accepts an extra keyword argument ``extract``:

        * ``None`` (default) -- return the found element(s) unchanged.
        * ``str`` -- return that attribute of the element, or a list of it
          for list results.
        * ``(key, value)`` tuple -- return a dict mapping each element's
          ``key`` attribute to its ``value`` attribute (``""`` if absent).

        Raises (from the wrapped methods):
            TypeError: if ``extract`` has an unsupported type or tuple length.
            MissingHTMLTagError: if the underlying call returns a falsy result.
            MissingHTMLAttributeError: if a requested attribute is missing.
        """

        def soup_wrapper(fn):
            @wraps(fn)
            def wrapped_soup(*args, **kwargs):
                # ``extract`` is ours; strip it before delegating to bs4.
                extract = kwargs.pop("extract", None)
                if not isinstance(extract, (str, tuple, type(None))):
                    raise TypeError("extract can only be of type None, str, or tuple")
                if isinstance(extract, tuple) and len(extract) != 2:
                    raise TypeError("extract tuple can only contain two values; key/value pair")

                elements = fn(*args, **kwargs)
                if not elements:
                    raise MissingHTMLTagError()
                if not extract:
                    return elements

                if isinstance(elements, list):
                    # handle `soup.find_all` and `soup.select`
                    if isinstance(extract, str):
                        # single attribute for multiple elements
                        return [_extract_attribute(el, extract) for el in elements]
                    # attribute pairs for multiple elements
                    key, value = extract
                    return dict(_extract_pair(el, key, value) for el in elements)

                if isinstance(extract, str):
                    # single attribute for single element; the original code
                    # referenced the loop variable `el` here, which is never
                    # bound on this path -- the element is `elements` itself.
                    return _extract_attribute(elements, extract)

                # attribute pair for a single element
                key, value = extract
                return dict([_extract_pair(elements, key, value)])

            return wrapped_soup

        # wrap all methods that start with find or select
        for name in dir(soup):
            if not name.startswith(("find", "select")):
                continue
            method = getattr(soup, name)
            # Skip anything that is not callable so plain data attributes are
            # never replaced by a function wrapper.
            if callable(method):
                setattr(soup, name, soup_wrapper(method))
        return soup

    def get_soup(self):
        """Parse ``self._resp.text`` and store the wrapped soup in ``self._soup``.

        An ``HTMLParseError`` is swallowed on purpose; ``self._soup`` then
        keeps whatever value it had before the call.
        """
        try:
            self._soup = self._wrapped_soup(BeautifulSoup(self._resp.text, "html.parser"))
        except HTMLParseError:
            # since this is implicit we need to fail gracefully
            # NOTE: the original author left a (disabled) logger warning here.
            pass


cls = MyClass()
cls._sess.get("https://www.example.com")
test = cls._soup.find("a", extract="href")
print("test:", test)

Advertisement

Answer