The code below fails with a “list index out of range” error.
# Question snippet: tries to download a Google Play app-details page, read the
# text of the first "review-body" element and collect it into a DataFrame.
# NOTE(review): `uReq`, `BeautifulSoup`, `pd` and `reviews_df` are used below
# but are not imported or defined anywhere in the visible snippet.
import bs4
import requests

# NOTE(review): the URL string has no scheme (e.g. "https://") and contains a
# space after "details?" -- confirm this is what was actually run.
my_url = requests.get('play.google.com/store/apps/details? id=com.delta.mobile.android&hl=en_US&showAllReviews=true')
# NOTE(review): `my_url` holds the result of requests.get(), not a URL string;
# presumably `uReq` is meant to be urllib's urlopen -- verify.
uClient = uReq(my_url)
page_soup = uClient.read()
uClient.close()

# Parse the downloaded content.
soup = BeautifulSoup(page_soup, "html.parser")
# Text of the first <div class="review-body"> element.
# NOTE(review): assumes such an element exists in the served HTML -- TODO confirm.
txt = soup.find('div', class_='review-body').get_text()
print(soup.get_text())
# Single-row frame (index 0) with the review text in column 'Review Text'.
temp = pd.DataFrame({'Review Text': txt}, index=[0])
print('-' * 10)

# Append the temp values to the DataFrame.
# NOTE(review): the return value of append() is discarded here; if
# `reviews_df` is a pandas DataFrame the original frame is left unchanged.
reviews_df.append(temp)

# Print the DataFrame.
print(reviews_df)
Advertisement
Answer
Try:
"""Print the review texts found in a Google Play app-details page.

The script downloads the page, collects the contents of its inline
``<script>`` tags, picks the last one containing the marker ``gp:`` and
prints every ``,null,"``-delimited fragment of it that holds no URL.
"""
# Not used below; retained from the original snippet.
import json
import urllib
from typing import Iterable, List, Optional

URL = 'http://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true'
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
headers = {"user-agent": USER_AGENT}

# Substring used to recognise the script payload that carries the reviews.
REVIEW_MARKER = "gp:"
# Separator that precedes each candidate text fragment in the payload.
# (The original literal had lost its backslash escape and did not parse.)
REVIEW_DELIMITER = ',null,"'


def fetch_script_texts(url: str = URL, *, timeout: float = 10.0) -> List[str]:
    """Download *url* and return the contents of its inline ``<script>`` tags.

    Args:
        url: Page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The string content of every ``<script>`` tag that has one, in
        document order.

    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-success HTTP status.
    """
    # Imported here so the pure parsing helpers below can be imported and
    # tested without the third-party packages or network access.
    import requests
    from bs4 import BeautifulSoup

    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were data.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")
    # `string=True` keeps only tags whose `.string` is not None.
    return [tag.string for tag in soup.find_all("script", string=True)]


def find_review_script(script_texts: Iterable[str]) -> Optional[str]:
    """Return the last script text containing ``REVIEW_MARKER``.

    Returns ``None`` when no script matches, rather than failing with
    "list index out of range" as an unguarded ``matches[-1]`` would.
    """
    matches = [text for text in script_texts if REVIEW_MARKER in text]
    return matches[-1] if matches else None


def extract_review_texts(script_text: str) -> List[str]:
    """Extract the review fragments from one script payload.

    The payload is split on ``REVIEW_DELIMITER``; the leading piece (everything
    before the first delimiter) is dropped, pieces containing ``http`` are
    skipped, and each remaining piece is cut at its first double quote.  A
    piece without a closing quote is returned whole instead of raising.
    """
    fragments = script_text.split(REVIEW_DELIMITER)[1:]
    return [
        fragment.partition('"')[0]
        for fragment in fragments
        if "http" not in fragment
    ]


def main() -> None:
    """Fetch the page and print each review followed by a blank line."""
    payload = find_review_script(fetch_script_texts())
    if payload is None:
        raise SystemExit(
            "No <script> block containing 'gp:' was found; "
            "the page layout may have changed."
        )
    for review in extract_review_texts(payload):
        print(review)
        print()


if __name__ == "__main__":
    main()
It worked for me!