I am trying to scraping name and email of agents from this site. The code firstly captures link to every profile on first page and then visits each profile to get name and email. But the problem is that it is taking alot of time to get anchor tag having name of agent in it. Here’s the code:
import os from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait class MessageIndividual(webdriver.Chrome): def __init__(self, driver_path=r";C:/SeleniumDriver", teardown=False): self.driver_path = driver_path self.teardown = teardown os.environ['PATH'] += self.driver_path #options = webdriver.ChromeOptions() #options.headless = True super(MessageIndividual, self).__init__() self.implicitly_wait(5) self.maximize_window() def __exit__(self, exc_type, exc_val, exc_tb): if self.teardown: self.quit() def goToSite(self): url = 'https://www.bhhs.com/agent-search-results' self.get(url) def getDetails(self): mylist = [my_elem.get_attribute("href") for my_elem in WebDriverWait(self, 1000).until( EC.visibility_of_all_elements_located((By.XPATH, "//section[@class='cmp-agent-results-list-view']/div[@class='cmp-agent-results-list-view__content container ']/div[@class='row associate pt-3 pb-3 ']/div[@class='col-6 col-sm-4 col-lg-3 order-lg-3 associate__btn-group']/section[2]/a[@href]")))] for i in mylist: self.execute_script("window.open('');") self.switch_to.window(self.window_handles[1]) self.get(i) name = WebDriverWait(self,5).until( EC.presence_of_element_located((By.XPATH,'//h1[@class="cmp-agent__name"]/a[1]')) ) print(name.text) email = WebDriverWait(self,1).until(EC.presence_of_element_located((By.CLASS_NAME,'cmp-agent-details__mail'))) print(email.text) self.close() self.switch_to.window(self.window_handles[0]) if __name__ == '__main__': inst = MessageIndividual(teardown=False) inst.goToSite() inst.getDetails()
Is there any way I can scrape name in lesser time?
Advertisement
Answer
I have change the xpath
to identify the anchor tag and remove the new window open in each iteration. hope this will reduce some time.
def getDetails(self): mylist = [my_elem.get_attribute("href") for my_elem in WebDriverWait(self, 1000).until( EC.visibility_of_all_elements_located((By.XPATH, "//a[.//span[normalize-space(.)='agent details']]")))] for i in mylist: #self.execute_script("window.open('');") #self.switch_to.window(self.window_handles[1]) self.get(i) name = WebDriverWait(self,5).until( EC.presence_of_element_located((By.XPATH,'//h1[@class="cmp-agent__name"]/a[1]')) ) print(name.text) email = WebDriverWait(self,1).until(EC.presence_of_element_located((By.CLASS_NAME,'cmp-agent-details__mail'))) print(email.text) #self.close() #self.switch_to.window(self.window_handles[0])