I am using pandas to read an Excel file from S3, apply an operation to one of the columns, and write the new version back to the same location, so that the new version overwrites the original.
With a CSV file I am able to achieve this using the code below, but I am not sure how to do it for Excel (.xlsx). Can someone please help?
import boto3
import pandas as pd
from io import StringIO

key = source_path + folder_name + "/" + file_name
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)

csv_buf = StringIO()
df = pd.read_csv(obj['Body'])
df["col1"] = df["col1"] * 100

# Write the modified DataFrame to an in-memory buffer and upload it back to the same key
df.to_csv(csv_buf, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=csv_buf.getvalue(), Key=key)
I tried the same code with read_excel and to_excel, but I get an UnsupportedOperation: seek error.
key = source_path + folder_name + "/" + file_name
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)

csv_buf = StringIO()
df = pd.read_excel(obj['Body'])
df["col1"] = df["col1"] * 100

df.to_excel(csv_buf, index=False)
csv_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=csv_buf.getvalue(), Key=key)
Error Message:
---------------------------------------------------------------------------
UnsupportedOperation                      Traceback (most recent call last)
<ipython-input-298-9363cf7d6609> in <module>
      6 obj = s3.get_object(Bucket=raw_bucket, Key=key)
      7 csv_buf = StringIO()
----> 8 df = pd.read_excel(obj['Body'])
      9 df["patient_ID"] = df["patient_ID"] * 100
     10 df.to_excel(csv_buf, index=False)

~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
    297                 )
    298             warnings.warn(msg, FutureWarning, stacklevel=stacklevel)
--> 299         return func(*args, **kwargs)
    300
    301     return wrapper

~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in read_excel(io, sheet_name, header, names, index_col, usecols, squeeze, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, thousands, comment, skipfooter, convert_float, mangle_dupe_cols, storage_options)
    334     if not isinstance(io, ExcelFile):
    335         should_close = True
--> 336         io = ExcelFile(io, storage_options=storage_options, engine=engine)
    337     elif engine and engine != io.engine:
    338         raise ValueError(

~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in __init__(self, path_or_buffer, engine, storage_options)
   1069             ext = "xls"
   1070         else:
-> 1071             ext = inspect_excel_format(
   1072                 content=path_or_buffer, storage_options=storage_options
   1073             )

~\anaconda3\lib\site-packages\pandas\io\excel\_base.py in inspect_excel_format(path, content, storage_options)
    951     ) as handle:
    952         stream = handle.handle
--> 953         stream.seek(0)
    954         buf = stream.read(PEEK_SIZE)
    955         if buf is None:

UnsupportedOperation: seek
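The traceback shows the failure inside pandas' inspect_excel_format, which calls stream.seek(0) to sniff whether the file is .xls or .xlsx; the StreamingBody returned by get_object is not seekable, which is why read_excel fails where read_csv did not. One possible workaround (a minimal sketch, not from the original post, assuming the same bucketname and key variables and that an Excel engine such as openpyxl is installed) is to read the body into a seekable in-memory BytesIO buffer first:

import boto3
import pandas as pd
from io import BytesIO

s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucketname, Key=key)

# Read the whole object into memory so pandas gets a seekable buffer
df = pd.read_excel(BytesIO(obj['Body'].read()))
df["col1"] = df["col1"] * 100

# Write the result to a fresh in-memory buffer and upload it to the same key
out_buf = BytesIO()
df.to_excel(out_buf, index=False)
out_buf.seek(0)
s3.put_object(Bucket=bucketname, Body=out_buf.getvalue(), Key=key)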
Sample file:

s.no    patient_id
1       100
2       200
3       300
4       400
5       500
Answer
Try the solution below; it worked for me. You can pass the S3 location of the file directly to pandas.
key = source_path + folder_name + "/" + file_name
# Build the full s3:// URL and let pandas read from and write to it directly
path = "s3://" + bucket_name + "/" + key
if target_file[num].endswith(".xlsx"):
    df = pd.read_excel(path)
    df["col1"] = df["col1"] * 100
    df.to_excel(path)
elif target_file[num].endswith(".csv"):
    df = pd.read_csv(path)
    df["col1"] = df["col1"] * 100
    df.to_csv(path)
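A follow-up note (not part of the original answer): reading and writing "s3://" URLs from pandas relies on s3fs being installed, and .xlsx support needs an engine such as openpyxl. Passing index=False to to_excel also keeps pandas from adding an extra index column each time the file is overwritten. A hedged variant of the same approach:

# Variant of the answer's approach; assumes s3fs and openpyxl are installed
# and that bucket_name and key are defined as above.
path = "s3://" + bucket_name + "/" + key
df = pd.read_excel(path, engine="openpyxl")
df["col1"] = df["col1"] * 100
df.to_excel(path, index=False, engine="openpyxl")  # overwrite the original object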