Looping through pages of search results

I am trying to scrape Reuters image captions for certain pictures. I have searched with my parameters and got a search result with 182 pages. The ‘PN=X’ part at the end of each link is the page number. I have built a for loop to go through the pages and scrape all captions:

# One URL per result page; the URLs differ only in the trailing PN=<n> value.
pages = [
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=1',
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=2',
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=3',
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=4',
    ...,  # placeholder standing in for the remaining page URLs
]

# Matches every element whose id contains "CaptionLong_Lbl".
caption_id = re.compile("CaptionLong_Lbl")

# Collects the text of every caption that does not end in '...'.
complete_captions = []
for link in pages:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    complete_captions.extend(
        caption.text
        for caption in soup.find_all(id=caption_id)
        if not caption.text.endswith('...')
    )

Python

# One URL per result page; the URLs differ only in the trailing PN=<n> value.
pages = [
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=1',
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=2',
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=3',
    'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN=4',
    ...,  # placeholder standing in for the remaining page URLs
]

# Matches every element whose id contains "CaptionLong_Lbl".
caption_id = re.compile("CaptionLong_Lbl")

# Collects the text of every caption that does not end in '...'.
complete_captions = []
for link in pages:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    complete_captions.extend(
        caption.text
        for caption in soup.find_all(id=caption_id)
        if not caption.text.endswith('...')
    )

The code runs, but it returns the same captions regardless of the page it is given: it just repeats the same 47 results over and over again. When I open the pages in my browser, however, they are different from each other, so the code should give different results. Any idea how to fix this?

Answer

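For this website, getting different results for each page is more complicated than just adding a page number to the URL and calling requests.get(). The PN=X value sits after the # in the URL, i.e. in the URL fragment, which is never sent to the server; the page's JavaScript evidently uses it to load the requested page in the browser, so requests.get() receives the same HTML every time.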

A simpler approach in this case is to use Selenium, which drives a real browser so that the page's JavaScript runs before the HTML is parsed, for example:

import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Scrape the captions that are not cut off with '...' from a range of result
# pages, rendering each page in headless Firefox before parsing its HTML.

FIRST_PAGE = 1
LAST_PAGE = 4  # inclusive; raise this to cover more result pages

# Pause after each navigation before the page source is read. A fixed delay is
# kept on purpose: the page URLs differ only in the part after '#', so waiting
# for caption elements to be present could succeed on the previous page's
# content.
RENDER_DELAY_SECONDS = 1

# Matches every element whose id contains "CaptionLong_Lbl".
CAPTION_ID = re.compile("CaptionLong_Lbl")

options = Options()
# The `options.headless` attribute is deprecated and removed in current
# Selenium 4 releases; passing the browser's own flag works across versions.
options.add_argument("-headless")

browser = webdriver.Firefox(options=options)
complete_captions = []

try:
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        print(f"Page {page_number}")

        url = f'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN={page_number}'
        browser.get(url)
        time.sleep(RENDER_DELAY_SECONDS)
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        complete_captions.extend(
            element.text
            for element in soup.find_all(id=CAPTION_ID)
            if not element.text.endswith('...')
        )
finally:
    # Always shut the browser down, even if a page fails, so no headless
    # Firefox process is left running.
    browser.quit()

Python

import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Scrape the captions that are not cut off with '...' from a range of result
# pages, rendering each page in headless Firefox before parsing its HTML.

FIRST_PAGE = 1
LAST_PAGE = 4  # inclusive; raise this to cover more result pages

# Pause after each navigation before the page source is read. A fixed delay is
# kept on purpose: the page URLs differ only in the part after '#', so waiting
# for caption elements to be present could succeed on the previous page's
# content.
RENDER_DELAY_SECONDS = 1

# Matches every element whose id contains "CaptionLong_Lbl".
CAPTION_ID = re.compile("CaptionLong_Lbl")

options = Options()
# The `options.headless` attribute is deprecated and removed in current
# Selenium 4 releases; passing the browser's own flag works across versions.
options.add_argument("-headless")

browser = webdriver.Firefox(options=options)
complete_captions = []

try:
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        print(f"Page {page_number}")

        url = f'https://pictures.reuters.com/CS.aspx?VP3=SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688#/SearchResult&VBID=2C0BXZS52QWLHI&SMLS=1&RW=1920&RH=688&PN={page_number}'
        browser.get(url)
        time.sleep(RENDER_DELAY_SECONDS)
        soup = BeautifulSoup(browser.page_source, 'html.parser')

        complete_captions.extend(
            element.text
            for element in soup.find_all(id=CAPTION_ID)
            if not element.text.endswith('...')
        )
finally:
    # Always shut the browser down, even if a page fails, so no headless
    # Firefox process is left running.
    browser.quit()

Obviously, a different browser can be used.

Advertisement

Answer