To begin with, I am a beginner trying to achieve something that is currently out of my league, so I hope you guys can help me out. Much appreciated.
I am trying to scrape the table from spaclens.com. I already tried the out-of-the-box solution from Google Sheets, but the site is JavaScript-based, which Google Sheets cannot handle. I found some code online and altered it to fit my needs, but now I am stuck.
Python
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup

# Step 1: Create a session and load the page
driver = webdriver.Chrome()
driver.get('https://www.spaclens.com/')

# Wait for the page to fully load
driver.implicitly_wait(5)

# Step 2: Parse HTML code and grab tables with Beautiful Soup
soup = BeautifulSoup(driver.page_source, 'lxml')

tables = soup.find_all('table')

# Step 3: Read tables with Pandas read_html()
dfs = pd.read_html(str(tables))

print(f'Total tables: {len(dfs)}')
print(dfs[0])

driver.close()
The code above gives me the following error:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-34-a32c8dbcef38> in <module>
     16
     17 # Step 3: Read tables with Pandas read_html()
---> 18 dfs = pd.read_html(str(tables))
     19
     20 print(f'Total tables: {len(dfs)}')

~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    294                 )
    295                 warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 296             return func(*args, **kwargs)
    297
    298         return wrapper

~\anaconda3\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
   1084     )
   1085     validate_header_arg(header)
-> 1086     return _parse(
   1087         flavor=flavor,
   1088         io=io,

~\anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
    915             break
    916     else:
--> 917         raise retained
    918
    919     ret = []

~\anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
    896
    897     try:
--> 898         tables = p.parse_tables()
    899     except ValueError as caught:
    900         # if `io` is an io-like object, check if it's seekable

~\anaconda3\lib\site-packages\pandas\io\html.py in parse_tables(self)
    215         list of parsed (header, body, footer) tuples from tables.
    216         """
--> 217         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
    218         return (self._parse_thead_tbody_tfoot(table) for table in tables)
    219

~\anaconda3\lib\site-packages\pandas\io\html.py in _parse_tables(self, doc, match, attrs)
    545
    546         if not tables:
--> 547             raise ValueError("No tables found")
    548
    549         result = []

ValueError: No tables found
Do I need to alter the argument to find the table? Can anyone shed some light on this?
Thanks!!
Answer
It'd be easier to just grab the data from the source, i.e. the endpoint the page itself calls (you can spot it in the browser's DevTools Network tab). The data comes back in a nice JSON format.
Python
import pandas as pd
import requests

# Endpoint the site's own table widget fetches its data from
url = 'https://www.spaclens.com/company/page'

# pageSize of 9999 pulls everything back in a single request
payload = {
    'pageIndex': '1',
    'pageSize': '9999',
    'query': '{}',
    'sort': '{}'}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

jsonData = requests.get(url, headers=headers, params=payload).json()
df = pd.DataFrame(jsonData['data']['items'])
Output: 846 rows, 78 columns
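From there the usual pandas calls apply if you want to sanity-check the result or keep a local copy; the filename below is just a placeholder:

Python
# Confirm the shape matches the output above
print(df.shape)           # expected: (846, 78)
print(df.columns[:10])    # peek at the first few column names

# Save a copy you can open in Excel or import into Google Sheets
df.to_csv('spaclens_companies.csv', index=False)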
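And if you'd rather stick with your original Selenium route: the problem is that implicitly_wait() only applies to element lookups, so driver.page_source can be grabbed before the JavaScript has rendered the table. The usual fix is an explicit wait. A minimal sketch, assuming at least one <table> eventually appears in the DOM:

Python
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get('https://www.spaclens.com/')

# Block for up to 30 seconds until a <table> element exists in the DOM
WebDriverWait(driver, 30).until(
    EC.presence_of_element_located((By.TAG_NAME, 'table'))
)

# read_html() accepts raw HTML directly, so the BeautifulSoup step isn't needed
dfs = pd.read_html(driver.page_source)
print(f'Total tables: {len(dfs)}')

driver.quit()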