I’m reading the book, Web Scraping with Python which has the following function to retrieve external links found on a page:
JavaScript
x
11
11
1
#Retrieves a list of all external links found on a page
2
def getExternalLinks(bs, excludeUrl):
3
externalLinks = []
4
#Finds all links that start with "http" that do
5
#not contain the current URL
6
for link in bs.find_all('a', {'href' : re.compile('^(http|www)((?!'+excludeUrl+').)*$')}):
7
if link.attrs['href'] is not None:
8
if link.attrs['href'] not in externalLinks:
9
externalLinks.append(link.attrs['href'])
10
return externalLinks
11
The problem is that it does not work the way it should. When i run it using the URL: http://www.oreilly.com, it returns this:
JavaScript
1
3
1
bs = makeSoup('https://www.oreilly.com') # Makes a BeautifulSoup Object
2
getExternalLinks(bs, 'https://www.oreilly.com')
3
Output:
JavaScript
1
34
34
1
['https://www.oreilly.com',
2
'https://oreilly.com/sign-in.html',
3
'https://oreilly.com/online-learning/try-now.html',
4
'https://oreilly.com/online-learning/index.html',
5
'https://oreilly.com/online-learning/individuals.html',
6
'https://oreilly.com/online-learning/teams.html',
7
'https://oreilly.com/online-learning/enterprise.html',
8
'https://oreilly.com/online-learning/government.html',
9
'https://oreilly.com/online-learning/academic.html',
10
'https://oreilly.com/online-learning/pricing.html',
11
'https://www.oreilly.com/partner/reseller-program.html',
12
'https://oreilly.com/conferences/',
13
'https://oreilly.com/ideas/',
14
'https://oreilly.com/about/approach.html',
15
'https://www.oreilly.com/conferences/',
16
'https://conferences.oreilly.com/velocity/vl-ny',
17
'https://conferences.oreilly.com/artificial-intelligence/ai-eu',
18
'https://www.safaribooksonline.com/public/free-trial/',
19
'https://www.safaribooksonline.com/team-setup/',
20
'https://www.oreilly.com/online-learning/enterprise.html',
21
'https://www.oreilly.com/about/approach.html',
22
'https://conferences.oreilly.com/software-architecture/sa-eu',
23
'https://conferences.oreilly.com/velocity/vl-eu',
24
'https://conferences.oreilly.com/software-architecture/sa-ny',
25
'https://conferences.oreilly.com/strata/strata-ca',
26
'http://shop.oreilly.com/category/customer-service.do',
27
'https://twitter.com/oreillymedia',
28
'https://www.facebook.com/OReilly/',
29
'https://www.linkedin.com/company/oreilly-media',
30
'https://www.youtube.com/user/OreillyMedia',
31
'https://www.oreilly.com/emails/newsletters/',
32
'https://itunes.apple.com/us/app/safari-to-go/id881697395',
33
'https://play.google.com/store/apps/details?id=com.safariflow.queue']
34
Question:
Why are the first 16-17 entries considered “external links”? They belong to the same domain of http://www.oreilly.com.
Advertisement
Answer
JavaScript
1
21
21
1
import urllib
2
from bs4 import BeautifulSoup
3
from urllib.request import urlopen
4
from urllib.parse import urlsplit
5
import re
6
ext = set()
7
def getExt(url):
8
o = urllib.parse.urlsplit(url)
9
html = urlopen(url)
10
bs = BeautifulSoup(html, 'html.parser')
11
for link in bs.find_all('a', href = re.compile('^((https://)|(http://))')):
12
if 'href' in link.attrs:
13
if o.netloc in (link.attrs['href']):
14
continue
15
else:
16
ext.add(link.attrs['href'])
17
getExt('https://oreilly.com/')
18
for i in ext:
19
print(i)
20
21