Accessing the contents on links provided on a webpage while webscraping

Question

This is a follow-up to my previous question. I am trying to access the contents of a webpage. I am able to search for contents on the webpage; however, I am not sure how to access the contents behind the links given on that page. For instance, the first line of the search result for id 1.1.1.1 is 36EUL/ADL_7 1.1.1.1 spectrophotometry ....

Accepted Answer

Everything can be done with Requests and BeautifulSoup, without Selenium. Here is code that fetches the data together with the details:

```python
import requests
from bs4 import BeautifulSoup

base_url = 'https://randr.nist.gov'
ec_name = 'enzyme'
search_term = '1.1.1.1'
url = f'{base_url}/{ec_name}/'

with requests.Session() as session:
    # get __VIEWSTATE, __VIEWSTATEGENERATOR, __EVENTVALIDATION parameters to use them in POST parameters
    response = session.get(url)
    page = BeautifulSoup(response.text, "html.parser")
    view_state = page.find(id="__VIEWSTATE")["value"]
    view_state_generator = page.find(id="__VIEWSTATEGENERATOR")["value"]
    event_validation = page.find(id="__EVENTVALIDATION")["value"]

    data = {
        '__EVENTTARGET': '',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE': view_state,
        '__VIEWSTATEGENERATOR': view_state_generator,
        '__SCROLLPOSITIONX': '0',
        '__SCROLLPOSITIONY': '0',
        '__EVENTVALIDATION': event_validation,
        'ctl00$MainBody$txtSrchAutoFill': search_term,
        'ctl00$MainBody$repoList': 'Enzyme_thermo',
        'ctl00$MainBody$ImgSrch.x': '0',
        'ctl00$MainBody$ImgSrch.y': '0'
    }
    response = session.post(url, data=data)
    page = BeautifulSoup(response.text, "html.parser")

    # get all rows
    rows = page.select("#MainBody_gvSearch tr")
    # first row is header, remove it
    rows.remove(rows[0])

    for row in rows:
        reference_id = row.select_one("[id*='lbSearch']").text.strip()
        ec_number = row.select_one("[id*='lblECNumber']").text.strip()
        method = row.select_one("[id*='lblMethod']").text.strip()
        buffer = row.select_one("[id*='lblBuffer']").text.strip()
        reaction = row.select_one("[id*='lblReaction']").text.strip()
        enzyme = row.select_one("[id*='lblEnzyme']").text.strip()
        cofactor = row.select_one("[id*='lblCofactor']").text.strip()
        evaluation = row.select_one("[id*='lblEvaluation']").text.strip()
        print(f"EC Number: {ec_number}, Reference Id: {reference_id}, Evaluation: {evaluation}")

        # get details
        params = (
            ('ID', reference_id),
            ('finalterm', search_term),
            ('data', ec_name),
        )
        response = session.get('https://randr.nist.gov/enzyme/DataDetails.aspx', params=params)
        page = BeautifulSoup(response.text, "html.parser")

        # parse general information
        if page.find("span", text='Reference:'):
            reference = page.find("span", text='Reference:').find_parent("td").find_next_sibling("td").text.strip()
        if page.find("span", text='pH:'):
            ph = page.find("span", text='pH:').find_parent("td").find_next_sibling("td").text.strip()

        # parse table
        extra_data = []
        try:
            table_headers = [x.text.strip() for x in page.select("#MainBody_extraData th")]
            table_data = [x.text.strip() for x in page.select("#MainBody_extraData td")]
            headers_count = len(table_headers)
            for i in range(0, len(table_data), headers_count):
                row = {}
                row_data = table_data[i:i + headers_count]
                for column_index, h in enumerate(table_headers):
                    row[h] = row_data[column_index]
                print("T(K): {}, pH: {}, K': {}".format(row["T(K)"], row["pH"], row["K'"]))
                extra_data.append(row)
        except Exception as ex:
            print("No details table found")
            print(ex)
        print("")
```

Output for some of the values:

```
EC Number:  1.1.1.1, Reference Id: 36EUL/ADL_7, Evaluation: C
T(K): 298.15, pH: 6.4, K': 1.3E-5
T(K): 298.15, pH: 7.0, K': 5.3E-5
T(K): 298.15, pH: 7.7, K': 1.3E-4

EC Number:  1.1.1.1, Reference Id: 37ADL/SRE_8, Evaluation: D
T(K): 298.15, pH: 6.05, K': 6.0E-6
T(K): 298.15, pH: 7.25, K': 7.7E-5
T(K): 298.15, pH: 8.0, K': 1.2E-5

EC Number:  1.1.1.1, Reference Id: 37NEG/WUL_9, Evaluation: C
T(K): 293.15, pH: 7.9, K': 7.41E-4

EC Number:  1.1.1.1, Reference Id: 38SCH/HEL_10, Evaluation: C
T(K): 298.15, pH: 6.30, K': 2.6E-5
T(K): 298.15, pH: 6.85, K': 8.8E-5
T(K): 298.15, pH: 7.15, K': 1.9E-4
T(K): 298.15, pH: 7.34, K': 3.0E-4
T(K): 298.15, pH: 7.61, K': 5.1E-4
T(K): 298.15, pH: 7.77, K': 8.0E-4
T(K): 298.15, pH: 8.17, K': 2.2E-3

EC Number:  1.1.1.1, Reference Id: 38SCH/HEL_23, Evaluation: C
T(K): 298.15, pH: 6.39, K': 9.1E-6
T(K): 298.15, pH: 6.60, K': 3.0E-5
T(K): 298.15, pH: 6.85, K': 5.1E-5
T(K): 298.15, pH: 7.18, K': 1.5E-4
T(K): 298.15, pH: 7.31, K': 2.3E-4
T(K): 298.15, pH: 7.69, K': 5.6E-4
T(K): 298.15, pH: 8.06, K': 1.1E-3
```

Advertisement

Answer