I don't know how to interpolate in PySpark when the DataFrame contains many columns. Let me explain. I need to group by webID and interpolate the counts values at 1-minute intervals. However, when I apply the code shown below, I get an error (java.lang.IllegalArgumentException). Answer: Set the environment variable ARROW_PRE_0_15_IPC_FORMAT=1. https://spark.apache.org/docs/3.0.0-preview/sql-pyspark-pandas-with-arrow.html#compatibiliy-setting-for-pyarrow--0150-and-spark-23x-24x

Interpolation in PySpark throws java.lang.IllegalArgumentException

I don’t know how to interpolate in PySpark when the DataFrame contains many columns. Let me explain.

from pyspark.sql.functions import to_timestamp

# Sample events: (webID, aType, timestamp string, counts), sampled at
# irregular intervals for two users.
df = spark.createDataFrame(
    [
        ("John", "A", "2018-02-01 03:00:00", 60),
        ("John", "A", "2018-02-01 03:03:00", 66),
        ("John", "A", "2018-02-01 03:05:00", 70),
        ("John", "A", "2018-02-01 03:08:00", 76),
        ("Mo", "A", "2017-06-04 01:05:00", 10),
        ("Mo", "A", "2017-06-04 01:07:00", 20),
        ("Mo", "B", "2017-06-04 01:10:00", 35),
        ("Mo", "B", "2017-06-04 01:11:00", 40),
    ],
    ("webID", "aType", "timestamp", "counts"),
)
# Replace the string column with its parsed timestamp value.
df = df.withColumn("timestamp", to_timestamp("timestamp"))

JavaScript
​x
 
from pyspark.sql.functions import to_timestamp
​
# Sample events: (webID, aType, timestamp string, counts), sampled at
# irregular intervals for two users.
df = spark.createDataFrame(
    [
        ("John", "A", "2018-02-01 03:00:00", 60),
        ("John", "A", "2018-02-01 03:03:00", 66),
        ("John", "A", "2018-02-01 03:05:00", 70),
        ("John", "A", "2018-02-01 03:08:00", 76),
        ("Mo", "A", "2017-06-04 01:05:00", 10),
        ("Mo", "A", "2017-06-04 01:07:00", 20),
        ("Mo", "B", "2017-06-04 01:10:00", 35),
        ("Mo", "B", "2017-06-04 01:11:00", 40),
    ],
    ("webID", "aType", "timestamp", "counts"),
)
# Replace the string column with its parsed timestamp value.
df = df.withColumn("timestamp", to_timestamp("timestamp"))
​

I need to group by webID and interpolate the counts values at 1-minute intervals. However, when I apply the code shown below, it fails with the error that follows.

from operator import attrgetter
from pyspark.sql.types import StructType
from pyspark.sql.functions import pandas_udf, PandasUDFType

def resample(schema, freq, timestamp_col="timestamp", **kwargs):
    """Build a grouped-map pandas UDF that resamples each group at ``freq``.

    Args:
        schema: Fields of the input DataFrame; they are sorted by name to
            form the UDF's return schema.
        freq: Resampling frequency handed to ``DataFrame.resample``.
        timestamp_col: Column used as the index for resampling.
        **kwargs: Accepted but not used.

    Returns:
        The decorated function, for use with ``groupBy(...).apply(...)``.
    """
    # The returned frame's columns are sorted by name below, so the
    # declared schema is sorted the same way.
    output_schema = StructType(sorted(schema, key=attrgetter("name")))

    @pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
    def _(pdf):
        pdf.set_index(timestamp_col, inplace=True)
        # Upsample onto the regular grid, interpolate, then forward-fill
        # whatever interpolation left empty.
        filled = pdf.resample(freq).interpolate().ffill()
        # Restore the timestamp as a regular column and order columns by name.
        return filled.reset_index(drop=False).sort_index(axis=1)

    return _


# Resample each webID group on a 60-second grid and print the result.
per_minute = resample(df.schema, "60S")
df.groupBy("webID").apply(per_minute).show()

JavaScript
 
from operator import attrgetter
from pyspark.sql.types import StructType
from pyspark.sql.functions import pandas_udf, PandasUDFType
​
def resample(schema, freq, timestamp_col="timestamp", **kwargs):
    """Build a grouped-map pandas UDF that resamples each group at ``freq``.

    Args:
        schema: Fields of the input DataFrame; they are sorted by name to
            form the UDF's return schema.
        freq: Resampling frequency handed to ``DataFrame.resample``.
        timestamp_col: Column used as the index for resampling.
        **kwargs: Accepted but not used.

    Returns:
        The decorated function, for use with ``groupBy(...).apply(...)``.
    """
    # The returned frame's columns are sorted by name below, so the
    # declared schema is sorted the same way.
    output_schema = StructType(sorted(schema, key=attrgetter("name")))

    @pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
    def _(pdf):
        pdf.set_index(timestamp_col, inplace=True)
        # Upsample onto the regular grid, interpolate, then forward-fill
        # whatever interpolation left empty.
        filled = pdf.resample(freq).interpolate().ffill()
        # Restore the timestamp as a regular column and order columns by name.
        return filled.reset_index(drop=False).sort_index(axis=1)

    return _
​
​
# Resample each webID group on a 60-second grid and print the result.
per_minute = resample(df.schema, "60S")
df.groupBy("webID").apply(per_minute).show()
​

Error:

py4j.protocol.Py4JJavaError: An error occurred while calling o371.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 77 in stage 31.0 failed 4 times, most recent failure: Lost task 77.3 in stage 31.0 (TID 812, 27faa516aadb4c40b7d7586d7493143c0021c825663, executor 2): java.lang.IllegalArgumentException
    at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)

JavaScript
 
py4j.protocol.Py4JJavaError: An error occurred while calling o371.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 77 in stage 31.0 failed 4 times, most recent failure: Lost task 77.3 in stage 31.0 (TID 812, 27faa516aadb4c40b7d7586d7493143c0021c825663, executor 2): java.lang.IllegalArgumentException
    at java.nio.ByteBuffer.allocate(ByteBuffer.java:334)
​

Answer

Set the environment variable ARROW_PRE_0_15_IPC_FORMAT=1.

https://spark.apache.org/docs/3.0.0-preview/sql-pyspark-pandas-with-arrow.html#compatibiliy-setting-for-pyarrow--0150-and-spark-23x-24x

def resample(schema, freq, timestamp_col = "timestamp",**kwargs):
    """Return a grouped-map pandas UDF that resamples each group at ``freq``.

    schema: fields of the input DataFrame; sorted by name to build the
        UDF's return schema.
    freq: resampling frequency passed to ``DataFrame.resample``.
    timestamp_col: column set as the index before resampling.
    **kwargs: accepted but not used.
    """
    @pandas_udf(
        StructType(sorted(schema, key=attrgetter("name"))), 
        PandasUDFType.GROUPED_MAP)
    def _(pdf):
        # The fix: set the Arrow compatibility flag inside the UDF body, so
        # the assignment runs in the process that executes the UDF.
        # NOTE(review): the linked Spark docs describe setting this variable
        # in conf/spark-env.sh for PyArrow >= 0.15.0 with Spark 2.3.x/2.4.x;
        # confirm which approach suits your deployment.
        import os                                      # add this line
        os.environ['ARROW_PRE_0_15_IPC_FORMAT']='1'    # add this line
        # Index by timestamp so the frame can be resampled.
        pdf.set_index(timestamp_col, inplace=True)
        # Resample at `freq` and interpolate the newly created rows.
        pdf = pdf.resample(freq).interpolate()
        # Forward-fill whatever interpolation left empty.
        pdf.ffill(inplace=True)
        # Turn the timestamp index back into a regular column.
        pdf.reset_index(drop=False, inplace=True)
        # Order columns by name, matching the name-sorted return schema.
        pdf.sort_index(axis=1, inplace=True)
        return pdf
    return _

JavaScript
 
def resample(schema, freq, timestamp_col = "timestamp",**kwargs):
    """Return a grouped-map pandas UDF that resamples each group at ``freq``.

    schema: fields of the input DataFrame; sorted by name to build the
        UDF's return schema.
    freq: resampling frequency passed to ``DataFrame.resample``.
    timestamp_col: column set as the index before resampling.
    **kwargs: accepted but not used.
    """
    @pandas_udf(
        StructType(sorted(schema, key=attrgetter("name"))), 
        PandasUDFType.GROUPED_MAP)
    def _(pdf):
        # The fix: set the Arrow compatibility flag inside the UDF body, so
        # the assignment runs in the process that executes the UDF.
        # NOTE(review): the linked Spark docs describe setting this variable
        # in conf/spark-env.sh for PyArrow >= 0.15.0 with Spark 2.3.x/2.4.x;
        # confirm which approach suits your deployment.
        import os                                      # add this line
        os.environ['ARROW_PRE_0_15_IPC_FORMAT']='1'    # add this line
        # Index by timestamp so the frame can be resampled.
        pdf.set_index(timestamp_col, inplace=True)
        # Resample at `freq` and interpolate the newly created rows.
        pdf = pdf.resample(freq).interpolate()
        # Forward-fill whatever interpolation left empty.
        pdf.ffill(inplace=True)
        # Turn the timestamp index back into a regular column.
        pdf.reset_index(drop=False, inplace=True)
        # Order columns by name, matching the name-sorted return schema.
        pdf.sort_index(axis=1, inplace=True)
        return pdf
    return _
​

Advertisement

Answer