Skip to content
Advertisement

Extracting reviews of Android App from Google Play store using Web Scraping method (Python BS4) – index out of range

The code below fails with a “list index out of range” error.

# NOTE(review): snippet as posted in the question; the code is left unchanged.
# Only `bs4` and `requests` are imported here, yet the lines below also use
# `uReq`, `BeautifulSoup`, `pd` and `reviews_df`, none of which are imported
# or defined in this snippet.
# NOTE(review): the URL string literal below is split across two lines and has
# no scheme (http:// or https://); the value returned by requests.get(...) is
# then passed on to uReq(...).
import bs4
import requests
my_url = requests.get('play.google.com/store/apps/details? 
id=com.delta.mobile.android&hl=en_US&showAllReviews=true') 
uClient = uReq(my_url) 
page_soup = uClient.read() 
uClient.close() 
#Parsing the content 
soup = BeautifulSoup(page_soup, "html.parser") 
txt = soup.find('div', class_='review-body').get_text() 
print(soup.get_text()) 
temp = pd.DataFrame({'Review Text': txt}, index=[0]) 
print('-' * 10) 
#Appending temp values into DataFrame 
# NOTE(review): the return value of append() is discarded here; presumably
# reviews_df is a pandas DataFrame -- confirm whether the result should be kept.
reviews_df.append(temp) 
#Printing DataFrame 
print(reviews_df)

Advertisement

Answer

Try:

import urllib , json , requests
from bs4 import BeautifulSoup
# Fetch the app's Play Store page and print text fragments pulled out of the
# inline <script> data (presumably the review texts -- verify against the
# live page, whose markup may change).
URL = 'http://play.google.com/store/apps/details?id=com.delta.mobile.android&hl=en_US&showAllReviews=true'
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
headers = {"user-agent": USER_AGENT}
resp = requests.get(URL, headers=headers)
soup = BeautifulSoup(resp.content, "html.parser")
#print(soup.prettify())

# Collect the text of every non-empty <script> tag that contains "gp:".
txt = soup.find_all('script', text=True)
a = [tag.text for tag in txt if "gp:" in tag.text]
if not a:
    # Without this check, a[-1] below would raise IndexError on an empty list.
    raise SystemExit('No <script> block containing "gp:" was found in the response.')

# Split the last matching script on the literal delimiter ,null,"
# and drop everything that precedes the first delimiter.
i = a[-1].split(',null,"')
del i[0]
for j in i:
    if 'http' in j:
        continue
    # Keep only the part before the next double quote; fragments that contain
    # no double quote at all are skipped instead of raising ValueError.
    head, quote, _ = j.partition('"')
    if quote:
        print(head)
        print()

It worked for me!

Advertisement