When I try to write the information to the CSV file, an error is thrown:
```
Traceback (most recent call last):
  File "sizeer.py", line 68, in <module>
    writer.writerow([name,color,price])
ValueError: I/O operation on closed file
```
```
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...

page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

# Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

# Iterate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]

    # Iterate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
        for link in product_links:
            # Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')
            # Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")
            # If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")
                    # Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
```
How do I keep the file open through the whole script execution? Or do I have to open it every time I add content?

EDIT: In fact, the file is not even created.
Answer
```
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
The file closes at the end of the `with` block – that is the block's purpose.
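You can see the mechanism in isolation with a minimal sketch (the file name `demo.csv` here is just a placeholder) that reproduces the same `ValueError`:

```
import csv

with open("demo.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["inside", "the", "block"])  # fine: the file is still open

# `file` was closed the moment the `with` block ended,
# so any further write through `writer` fails:
writer.writerow(["outside", "the", "block"])  # ValueError: I/O operation on closed file
```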
You could put everything inside the block, but that would only make the existing problem worse: the code already reaches several levels of indentation, is long, and is becoming difficult to understand. This is why you use functions to organize the code. For example, if you move the big `for` loop into a function:
```
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
```
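Note the design choice here: the function receives the `writer` as a parameter rather than opening the file itself, so the `with` block stays the single place that controls the file's lifetime. As a side note, the `csv` module's documentation recommends opening the file with `newline=""` (i.e. `open("result.csv", "w", newline="")`) so that `csv.writer` can handle line endings itself.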
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic for handling the `variations` of a single product. Or you can have a function that handles the creation of the `categories` data and `return`s it.
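As a rough sketch of that further decomposition, built from the code in the question (the function names and signatures are illustrative, not part of the original script):

```
import requests
from bs4 import BeautifulSoup

def get_categories(soup):
    # Build and return the `categories` data from the menu
    categories = []
    for cw in soup.find_all(class_="m-menu_subItem"):
        anchor = cw.find("a", recursive=False)
        categories.append(anchor['href'])
    return categories

def handle_variations(variations, base_url, proxies, writer):
    # Innermost logic: write one row per variation of a single product
    for v in variations:
        variation_page = requests.get(base_url + v['href'], proxies=proxies)
        variation_soup = BeautifulSoup(variation_page.content, 'lxml')
        price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
        name = variation_soup.find(class_="m-productDescr_headline").text.strip()
        writer.writerow([name, v['title'], price])
```

Each function now does one job, and only the pieces that actually write to the CSV ever see the `writer`.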