I was trying to make a crawler that follows links, using this code:
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def start_requests(self):
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(10)
        doencas = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        for doenca in doencas:
            url = doenca.get_attribute('href')
            yield scrapy.Request(url)
        driver.quit()

    def parse(self, response):
        urls = response.css(
            '.mdk-dictionary-list__glossary-item a::attr(href)')
        for url in urls:
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        contents = response.css('div.page-glossary-detail__main-content')
        for desc in response.css('div.mdk-conclusion-detail__main-description'):
            desc = response.css('p ::text').getall()
            yield {
                'desc': desc
            }
        for content in contents:
            yield {
                'name': content.css(
                    'div.mdk-conclusion-detail__main-title ::text').get().strip(),
                'espec': content.css(
                    'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
            }
I was able to get the links, but the part where I enter each link and extract the information I need was not working, so a friend helped me come up with this code:
import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_auth_code():
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + \
        len('APP_API_AUTH_CODE:"')
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]


def get_auth_token_and_device_id():
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()['authToken'], response.json()['deviceId']


def get_conclusion_list(auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
              ",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return [conclusion['conclusionId']
            for conclusion in response.json()['conclusions']]


def get_details(conclusionId, auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
              "\"conclusionId\":\"" + conclusionId + "\"," \
              "\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.text


auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    print(get_details(conclusion, auth_token, device_id))
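As an aside, hand-escaping those JSON payloads is fragile. A minimal sketch of the same login request built with json.dumps (same endpoint and field names as above, so the quoting is handled for you):

import json
import requests

def get_auth_token_and_device_id():
    # Same endpoint and fields as the version above; json.dumps builds
    # the JSON body instead of hand-escaped string concatenation.
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = json.dumps({
        "useCache": 0,
        "apiVersion": "4.1.1",
        "appVersion": "8.7.0",
        "appId": None,
        "deviceType": "WEB",
        "deviceToken": None,
        "language": "pt_BR",
        "timezoneRaw": 180,
        "authTokenRefreshExpiresIn": None,
    })
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.post(url, headers=headers, data=payload)
    body = response.json()
    return body['authToken'], body['deviceId']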
It gets the JSON with the page items, but around iteration 230 it starts returning the following error and never gets past it:
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
What I'd like to do is write all of this to a file so I can check whether it is getting every item on the page I need, and then produce a JSON file with just the information I need, not everything from the site as it is returning now.
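For the file part, a minimal sketch reusing the helper functions above could look like the following; note that 'name' and 'description' are hypothetical keys, since I haven't checked the exact structure of the conclusionDetail response, so inspect one raw response first to find the real ones:

import json

auth_token, device_id = get_auth_token_and_device_id()
results = []
for conclusion in get_conclusion_list(auth_token, device_id):
    body = json.loads(get_details(conclusion, auth_token, device_id))
    # Keep only the fields of interest; replace these hypothetical keys
    # with the actual ones from an inspected response.
    results.append({
        'conclusionId': conclusion,
        'name': body.get('name'),
        'description': body.get('description'),
    })

# Dump everything to a file for inspection.
with open('conclusions.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

This could be combined with the retry wrapper above so the token-expiry error doesn't cut the run short.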
Answer
After many sleepless nights I solved my problem; I will leave it here in case it helps someone.
import scrapy  # still needed: the class below subclasses scrapy.Spider
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def parse(self, response):
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        time.sleep(10)
        el_links = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        urls = []
        nome_doenca = []
        for i in range(len(el_links)):
            urls.append(el_links[i].get_attribute('href'))
        for link in urls:
            driver.get(link)
            # Wait until the title element is present before reading it.
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    "//div[@class='mdk-conclusion-detail__main-title']")))
            nome_source = driver.find_element(
                By.XPATH,
                "//div[@class='mdk-conclusion-detail__main-title']").text
            nome_doenca.append(nome_source)
            driver.back()
        print(nome_doenca)
        driver.quit()
I just modified my code and didn't use Scrapy, only the Selenium selectors.
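Since the original goal was to end up with a file, one small addition, sketched here as an assumption rather than part of the accepted answer, is to dump the collected nome_doenca list to JSON at the end of parse instead of just printing it:

# At the end of parse(), after the loop:
with open('doencas.json', 'w', encoding='utf-8') as f:
    json.dump({'nomes': nome_doenca}, f, ensure_ascii=False, indent=2)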