I was trying to make a crawler that follows links, with this code:
Python

import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def start_requests(self):
        # The glossary page is rendered with JavaScript, so open it in
        # headless Chrome first and collect the links from there.
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(10)

        doencas = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        for doenca in doencas:
            url = doenca.get_attribute('href')
            yield scrapy.Request(url)
        driver.quit()

    def parse(self, response):
        urls = response.css(
            '.mdk-dictionary-list__glossary-item a::attr(href)')
        for url in urls:
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        contents = response.css('div.page-glossary-detail__main-content')
        for desc in response.css('div.mdk-conclusion-detail__main-description'):
            desc = response.css('p ::text').getall()
            yield {
                'desc': desc
            }
        for content in contents:
            yield {
                'name': content.css(
                    'div.mdk-conclusion-detail__main-title ::text').get().strip(),
                'espec': content.css(
                    'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
            }
I was able to get the links, but the part where I enter each link and extract the information I need wasn't working, so a friend helped me come up with this code:
Python

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_auth_code():
    # The site ships its API auth code inside vendor.js; pull it out of
    # the APP_API_AUTH_CODE constant.
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]


def get_auth_token_and_device_id():
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()['authToken'], response.json()['deviceId']


def get_conclusion_list(auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]


def get_details(conclusionId, auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
              "\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
              "\"conclusionId\":\"" + conclusionId + "\"," \
              "\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.text


auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    print(get_details(conclusion, auth_token, device_id))
It gets the JSON with the page items, but around iteration 230 it starts returning the following error for every request and never recovers:
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
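From the "retry": true in the response I imagine I'm supposed to log in again and get a fresh token when this happens, roughly like this (just a sketch of what I mean, reusing the functions above; I haven't gotten it working):

import json

def get_details_with_retry(conclusionId, auth_token, device_id):
    # Sketch: log in again once when the API reports an expired token (ME667).
    result = json.loads(get_details(conclusionId, auth_token, device_id))
    error = result.get('error')
    if error and error.get('code') == 'ME667' and error.get('retry'):
        auth_token, device_id = get_auth_token_and_device_id()
        result = json.loads(get_details(conclusionId, auth_token, device_id))
    return result, auth_token, device_id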
What I'd like to do is write all of this to a file so I can check whether it is getting every item I need from the page, and then end up with a JSON file containing only the information I need, not everything the site returns as it does now.
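Something like this is the shape I have in mind (a rough sketch; the 'conclusion' key and the field names inside it are guesses from the responses I've seen and may need adjusting):

import json

auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)

items = []
for conclusion in conclusion_list:
    detail = json.loads(get_details(conclusion, auth_token, device_id))
    # Keep only the fields I actually need instead of the whole response.
    items.append({
        'name': detail.get('conclusion', {}).get('name'),
        'description': detail.get('conclusion', {}).get('description'),
    })

# Write everything to one file so I can check that no item is missing.
with open('doencas.json', 'w', encoding='utf-8') as f:
    json.dump(items, f, ensure_ascii=False, indent=2)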
Answer
After many sleepless nights I solved my problem; I'll leave it here in case it helps someone.
Python

import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def parse(self, response):
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        time.sleep(10)

        # Collect the glossary links once the JavaScript page has rendered.
        el_links = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        urls = []
        nome_doenca = []

        for i in range(len(el_links)):
            urls.append(el_links[i].get_attribute('href'))

        # Visit each link and grab the disease name from the detail page.
        for link in urls:
            driver.get(link)

            myElem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH,
                    "//div[@class='mdk-conclusion-detail__main-title']"
                )))
            nome_source = driver.find_element(
                By.XPATH, "//div[@class='mdk-conclusion-detail__main-title']"
            ).text

            nome_doenca.append(nome_source)

            driver.back()
        print(nome_doenca)
        driver.quit()
I just modified my code and didn't use Scrapy, only the Selenium selectors.
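If you want the names in a file instead of a print, you can dump the list at the end of parse (a small sketch, assuming nome_doenca is filled as above):

import json

with open('nome_doenca.json', 'w', encoding='utf-8') as f:
    json.dump(nome_doenca, f, ensure_ascii=False, indent=2)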