I am trying to scrape a table from url. The table seems to have no name. I have scraped the links and its text to csv using below code.
from bs4 import BeautifulSoup import requests import re url = 'https://www.sbp.org.pk/smefd/circulars/2020/index.htm' r = requests.get(url) soup = BeautifulSoup(r.content, 'html.parser') anchor = soup.find_all('a',href = re.compile('2020')) file_name = "Circulars.csv" f = open(file_name,'w') header = 'Cir_Name ,Linksn' f.write(header) for link in anchor: href = link.get('href') text = link.getText() text1 = text.replace("n", "") f.write(text1.replace(',','|') + "," + href.replace("," ,"|") +"n") print("Done")
What I need is to scrape the table as it is. currently I am able to get just links. I need other columns too.
I have tried following code but failed. I am able to get to the table but thats not enough
from bs4 import BeautifulSoup import requests import re url = 'https://www.sbp.org.pk/smefd/circulars/2020/index.htm' r = requests.get(url) soup = BeautifulSoup(r.content, 'html.parser') table = soup.find_all('table',attrs={"width": "95%",'border':'0','cellpadding':"1"}) tablee = soup.find_all('tr',attrs={'table',attrs={"width": "95%",'border':'0','cellpadding':"1"}}) print(tablee)
Advertisement
Answer
To save data to CSV, you can use this example:
import requests import pandas as pd from bs4 import BeautifulSoup url = 'https://www.sbp.org.pk/smefd/circulars/2020/index.htm' soup = BeautifulSoup(requests.get(url).content, 'html.parser') table = soup.select_one('table[width="95%"]:not(:has(table))') all_data = [] for row in table.select('tr:not(:has(td[colspan]))'): tds = [td.get_text(strip=True).replace('n', ' ').replace('t', ' ') for td in row.select('td') if td.get_text(strip=True)] tds += [row.find_previous('td', {'colspan': '4'}).get_text(strip=True).replace('n', ' '), row.a['href']] all_data.append(tds) df = pd.DataFrame(all_data) print(df) df.to_csv('data.csv', index=False, header=False)
Saves data.csv
(screenshot from LibreOffice):