I am trying to search for multiple keywords (the entries of filteredList) and get a list of search results for each one. This is the code I have tried below:
from googleapiclient.discovery import build
import csv
import pprint

my_api_key = "xxx"
my_cse_id = "xxx"


def google_search(search_term, api_key, cse_id, **kwargs):
    """Run one Google Custom Search query and return its result items.

    Returns an empty list when the response contains no 'items' key,
    which happens for queries with zero results (using res['items']
    directly raises KeyError: 'items' in that case).
    """
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return res.get('items', [])


filteredList = ['Optimal Elektronika', 'Evrascon', ]
words = [
    'vakansiya'
]
newDictList = []
# keys to keep from every search result: htmlSnippet, link and htmlTitle
keyValList = ['link', 'htmlTitle', 'htmlSnippet']

for word in filteredList:
    results = google_search(word, my_api_key, my_cse_id, num=5)
    # print(results)
    for result in results:
        # Build a fresh dict per result; the old code reused one dict and
        # also read newDict['htmlSnippet'] before any key was assigned,
        # which raised KeyError on the first result.
        newDict = {key: value for key, value in result.items()
                   if key in keyValList}
        # NOTE: pprint.pprint() only prints and always returns None, so its
        # return value must never be stored -- keep the real `value` instead.
        newDictList.append(newDict)

print(newDictList)
Running the answer script
The error code I got (Running the answer script):
Traceback (most recent call last): File "/Users/valizadavali/PycharmProjects/webScrape/GCS.py", line 39, in <module> items = google_search(word, API_KEY, CSE_ID, num=5) File "/Users/valizadavali/PycharmProjects/webScrape/GCS.py", line 11, in google_search return res['items'] KeyError: 'items'
Advertisement
Answer
I don’t have API keys to run this code, but I see a few mistakes:
When you use
for items in filteredList:
then you get a word from the list, not its index, so you can’t compare it with a number.
To get number you would use
for items in range(len(filteredList)):
But instead of that version, it is better to use the first version and then pass items
instead of filteredList[items]
in
results = google_search(items, my_api_key, my_cse_id, num=5)
If you choose version with range(len(filteredList)):
then don’t add 1 to items – because then you get numbers 1..6
instead of 0..5
so you skip the first element, filteredList[0],
and it doesn’t search the first word. And later you try to get filteredList[6]
which doesn’t exist on list and you get your error message.
for word in filteredList:
    results = google_search(word, my_api_key, my_cse_id, num=5)
    print(results)

    for result in results:
        # a fresh dict for every result, keeping only the wanted keys
        newDict = {key: value for (key, value) in result.items()
                   if key in keyValList}
        newDictList.append(newDict)

print(newDictList)
BTW: you have to create newDict = dict()
in every loop iteration (once per result).
BTW: standard print()
and pprint.pprint()
is used only to send text to the screen and always returns None
so you can’t assign the displayed text to a variable. If you need formatted text, use string formatting instead.
EDIT: version with range(len(...))
which is not preferred in Python.
for index in range(len(filteredList)):
    results = google_search(filteredList[index], my_api_key, my_cse_id, num=5)
    print(results)

    for result in results:
        # a fresh dict for every result, keeping only the wanted keys
        newDict = {key: value for (key, value) in result.items()
                   if key in keyValList}
        newDictList.append(newDict)

print(newDictList)
EDIT:
from googleapiclient.discovery import build
import requests

API_KEY = "AIzXXX"
CSE_ID = "013XXX"


def google_search(search_term, api_key, cse_id, **kwargs):
    """Query Google Custom Search and return the list of result items.

    Returns [] when the response has no 'items' key -- this happens for
    queries with no results and is exactly what raised the original
    KeyError: 'items'.
    """
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    return res.get('items', [])


words = [
    'Semkir sistem',
    'Evrascon',
    'Baku Electronics',
    'Optimal Elektroniks',
    'Avtostar',
    'Improtex',
    # 'Wayback Machine'
]

filtered_results = list()

keys = ['cacheId', 'link', 'htmlTitle', 'htmlSnippet', ]

for word in words:
    items = google_search(word, API_KEY, CSE_ID, num=5)

    for item in items:
        #print(item.keys()) # to check if every item has the same keys. It seems some items don't have 'cacheId'

        row = dict()  # row of data in final list with results

        for key in keys:
            row[key] = item.get(key)  # None if there is no `key` in `item`
            #row[key] = item[key]     # ERROR if there is no `key` in `item`

        # generate link to cached page
        if row['cacheId']:
            row['link_cache'] = 'https://webcache.googleusercontent.com/search?q=cache:{}:{}'.format(row['cacheId'], row['link'])

            # TODO: read HTML from `link_cache` and get full text.
            # Maybe module `newspaper` can be useful for some pages.
            # For other pages module `urllib.request` or `requests` can be needed.
            row['html'] = requests.get(row['link_cache']).text
        else:
            row['link_cache'] = None
            row['html'] = ''

        # check word in title and snippet. Word may use upper and lower case
        # chars so I convert to lower case to skip this problem.
        # It doesn't work if the text uses native chars - e.g. Cyrillic.
        lower_word = word.lower()
        # `or ''` guards the items that lack htmlTitle/htmlSnippet --
        # item.get() stored None for them and None.lower() would crash.
        title = (row['htmlTitle'] or '').lower()
        snippet = (row['htmlSnippet'] or '').lower()
        if (lower_word in title) or (lower_word in snippet) or (lower_word in row['html'].lower()):
            filtered_results.append(row)
        else:
            print('SKIP:', word)
            print('    :', row['link'])
            print('    :', row['htmlTitle'])
            print('    :', row['htmlSnippet'])
            print('-----')

for item in filtered_results:
    print('htmlTitle:', item['htmlTitle'])
    print('link:', item['link'])
    print('cacheId:', item['cacheId'])
    print('link_cache:', item['link_cache'])
    print('part of html:', item['html'][:300])
    print('---')