How to change column value with pandas .apply() method

I stumbled upon an issue while trying to take data from a CSV file, assemble a key and then creating a new CSV file with only the necessary data.

Example data:

ID1       Data1   Data2  Price1 Color   Key          ID2       Data3   Price2
12345/6  950/000  Pd950   996    G    4/20017/6   4/20017/6   950/000   1108
12345/6  333/000  Pd333   402    G    4/20017/6   4/20017/6   333/000    501
12345/6  500/000  Pd500   550    G    4/20017/6   4/20017/6   500/000    577
12345/6  950/000  PT950   690    Pt   4/20017/6   4/20017/6   950/000    779
12345/6  600/000  Pt600   424    Pt   4/20017/6   4/20017/6   600/000    482

My code:

file_original = pd.read_csv(path_to_file, sep=";")

def getNum(itemPair):

    # Uses data to construct the output
    # Returns a string e.g. "4-12345-6_12345-6_XX-333"


def processItems(row):
    
    first_nr = row["ID1"].replace("/", "-")
    second_nr = row["ID2"].replace("/", "-")

    color = str(row["Color"]).lower()
    alloy = row["Data2"]

    Price1 = row["Price1"]
    Price2 = row["Price2"]

    alloys = []

    '''
    Performs a RegEx search on the alloy information. The pattern checks for
    two alloy data sets within one string.
    If result is none, there is only one data set and the next RegEx pattern is used.
    '''
    regEx = re.search(r"(w{0,2}d{3}).*?(w{0,2}d{3})", legierung)
    if regEx:
        if regEx.group(1):
            alloys.append(regEx.group(1))
        if regEx.group(2):
            alloys.append(regEx.group(2))
    else:
        regEx = re.search(r"(d{3})", legierung)
        if regEx.group(1):
            alloys.append(regEx.group(1))


    alloys = sorted(alloys, key=sortOrderAlloy)

    # Strips away any letters from the very first alloy entry
    if len(alloys[0]) > 3:
        alloys[0] = alloys[0][2:]


    colors = []
    i = 0
    while i < len(color):
        if color[i] == "p" and (i < (len(color) - 1) and (color[i + 1] == "t" or color[i + 1] == "d")):
            # If the current letter is "p" and the next letter is "t" or "d" consider them one color
            result = color[i] + color[i + 1]
            i += 1

            if result not in colors:
                # If the color isn't yet part of the list, append it
                colors.append(result)

        else:
            if color[i] not in colors:
                # If the color isn't yet part of the list, append it
                colors += color[i]
        i += 1


    colors = sorted(colors, key=sortOrderColor)


    # Check if there are more colors than alloys which means there are multiple colors per alloy
    if len(colors) > len(alloys):
        # If there is no color starting with "p", the alloy can be applied to every color
        if ("pt" not in colors) and ("pd" not in colors):
            # Counter variable for while loop
            c = len(alloys)

            while c < len(colors):
                alloys.append(alloys[0])
                c = len(alloys)
        
        else:
            # The amount of missing alloys for while loop
            count_diff = len(colors) - len(alloys)
            
            while count_diff != 0:
                alloys.insert(0, alloys[0])
                count_diff -= 1


    materials = []

    # Count variable for while loop
    i = 0

    while i < len(colors):
        materials.append(getMaterial(colors[i]))
        i += 1

    # Do some stuff to get the necessary data while assigning the following variables:
    # first_nr, second_nr, materials, alloys, price1, price2

    row["idNum"] = getNum(first_nr, second_nr, materials, alloys, price1, price2)

    row["price"] = (price1 + price2)

file_original["idNum"] = ""
file_original["price"] = 0

file_original.apply(processItems, axis=1)

file_processed = file_original[["idNum", "price"]]

file_processed.to_csv(path_output, sep=";", index=False, encoding="utf-8-sig")

JavaScript
​x
 
file_original = pd.read_csv(path_to_file, sep=";")
​
def getNum(itemPair):
​
    # Uses data to construct the output
    # Returns a string e.g. "4-12345-6_12345-6_XX-333"
​
​
def processItems(row):
    
    first_nr = row["ID1"].replace("/", "-")
    second_nr = row["ID2"].replace("/", "-")
​
    color = str(row["Color"]).lower()
    alloy = row["Data2"]
​
    Price1 = row["Price1"]
    Price2 = row["Price2"]
​
    alloys = []
​
    '''
    Performs a RegEx search on the alloy information. The pattern checks for
    two alloy data sets within one string.
    If result is none, there is only one data set and the next RegEx pattern is used.
    '''
    regEx = re.search(r"(w{0,2}d{3}).*?(w{0,2}d{3})", legierung)
    if regEx:
        if regEx.group(1):
            alloys.append(regEx.group(1))
        if regEx.group(2):
            alloys.append(regEx.group(2))
    else:
        regEx = re.search(r"(d{3})", legierung)
        if regEx.group(1):
            alloys.append(regEx.group(1))
​
​
    alloys = sorted(alloys, key=sortOrderAlloy)
​
    # Strips away any letters from the very first alloy entry
    if len(alloys[0]) > 3:
        alloys[0] = alloys[0][2:]
​
​
    colors = []
    i = 0
    while i < len(color):
        if color[i] == "p" and (i < (len(color) - 1) and (color[i + 1] == "t" or color[i + 1] == "d")):
            # If the current letter is "p" and the next letter is "t" or "d" consider them one color
            result = color[i] + color[i + 1]
            i += 1
​
            if result not in colors:
                # If the color isn't yet part of the list, append it
                colors.append(result)
​
        else:
            if color[i] not in colors:
                # If the color isn't yet part of the list, append it
                colors += color[i]
        i += 1
​
​
    colors = sorted(colors, key=sortOrderColor)
​
​
    # Check if there are more colors than alloys which means there are multiple colors per alloy
    if len(colors) > len(alloys):
        # If there is no color starting with "p", the alloy can be applied to every color
        if ("pt" not in colors) and ("pd" not in colors):
            # Counter variable for while loop
            c = len(alloys)
​
            while c < len(colors):
                alloys.append(alloys[0])
                c = len(alloys)
        
        else:
            # The amount of missing alloys for while loop
            count_diff = len(colors) - len(alloys)
            
            while count_diff != 0:
                alloys.insert(0, alloys[0])
                count_diff -= 1
​
​
    materials = []
​
    # Count variable for while loop
    i = 0
​
    while i < len(colors):
        materials.append(getMaterial(colors[i]))
        i += 1
​
    # Do some stuff to get the necessary data while assigning the following variables:
    # first_nr, second_nr, materials, alloys, price1, price2
​
    row["idNum"] = getNum(first_nr, second_nr, materials, alloys, price1, price2)
​
    row["price"] = (price1 + price2)
​
file_original["idNum"] = ""
file_original["price"] = 0
​
file_original.apply(processItems, axis=1)
​
file_processed = file_original[["idNum", "price"]]
​
file_processed.to_csv(path_output, sep=";", index=False, encoding="utf-8-sig")
​

The intended result would be a CSV file with just two columns “idNum” and “price” and the columns are limited as intended, however the rows all end up with the empty data I use to create these two new columns, so an empty string and the number 0.

After searching through Google I discovered that using .apply() does not allow you to directly change the data, instead it needs to be reassigned but when I change the respective line, I get an error.

# Old line
file_original.apply(processItems, axis=1)

# New line
file_original = file_original.apply(processItems, axis=1)

JavaScript
 
# Old line
file_original.apply(processItems, axis=1)
​
# New line
file_original = file_original.apply(processItems, axis=1)
​

The error reads as follows:

Traceback (most recent call last):
  File "prepareImport.py", line 278, in 
    file_processed = file_original[["idNum", "price"]]
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreseries.py", line 910, in __getitem__
    return self._get_with(key)
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreseries.py", line 958, in _get_with
    return self.loc[key]
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreindexing.py", line 1768, in __getitem__
    return self._getitem_axis(maybe_callable, axis=axis)
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreindexing.py", line 1954, in _getitem_axis
    return self._getitem_iterable(key, axis=axis)
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreindexing.py", line 1595, in _getitem_iterable
    keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreindexing.py", line 1552, in _get_listlike_indexer
    self._validate_read_indexer(
  File "C:UsersMY-USERanaconda3libsite-packagespandascoreindexing.py", line 1640, in _validate_read_indexer
    raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Index(['idNum', 'price'], dtype='object')] are in the [index]"

Line 278 => file_processed = file_original[["idNum", "price"]]

JavaScript
 
Line 278 => file_processed = file_original[["idNum", "price"]]
​

As far as I can tell the dataframe somehow looses the two columns I add before reassigning it.
So I decided to print it to the console rather than trying to extract two columns, the result shows that apparently the dataframe was transformed to a Series, so it has just one column filled with “None” of the datatype “object” although it keeps the full length of 10550.

I found a few other possible solutions to my original issue but they usually addressed an error the user made. Of course I checked my code for similar errors but couldn’t find any myself so I hope someone can help me out with this.

Thank you very much in advance! If I forgot to provide any kind of crucial information please let me know so I can add it.

Best regards
Eisman333

Edit: Expected output:

idNum                              price
4-12345-6_12345-6_XX-333            250
4-12345-6_12345-6_XX-585_YY-950    2130
4-12345-6_12345-6_XX-333_ZZ-500    1560

Answer

There is not enough of your logic/program for me to duplicate all the aspects of your processItems function, but I would suggest you think about separating out the individual column updates. You could implement something along the lines of the following where this method is called with the original dataframe:

def groom_data(idf):
    idf['ID1'] = idf.apply(lambda row: row.ID1.replace("/", '-'), axis= 1)
    idf['ID2'] = idf.apply(lambda row: row.ID2.replace("/", '-'), axis= 1)
    idf['Color'] = idf.apply(lambda row: row.Color.lower(), axis= 1)
    idf['Alloys'] = idf.apply(lambda row: process_alloys(row), axis= 1)
    idf['Materials'] = idf.apply(lambda row: process_items(row), axis= 1)
    idf['IdNum']= idf.apply(lambda row: getNum(row.ID1, row.ID2, row.Materials, row.Alloys, row.Price1, row.Price2))
    idf['Total_Price'] = idf.apply(lambda row: row.Price1 + row.Price2)
    return idf

JavaScript
 
def groom_data(idf):
    idf['ID1'] = idf.apply(lambda row: row.ID1.replace("/", '-'), axis= 1)
    idf['ID2'] = idf.apply(lambda row: row.ID2.replace("/", '-'), axis= 1)
    idf['Color'] = idf.apply(lambda row: row.Color.lower(), axis= 1)
    idf['Alloys'] = idf.apply(lambda row: process_alloys(row), axis= 1)
    idf['Materials'] = idf.apply(lambda row: process_items(row), axis= 1)
    idf['IdNum']= idf.apply(lambda row: getNum(row.ID1, row.ID2, row.Materials, row.Alloys, row.Price1, row.Price2))
    idf['Total_Price'] = idf.apply(lambda row: row.Price1 + row.Price2)
    return idf 
​

This would require to separate the logic for identifying the alloys and materials into two passes, I suggest that, since you need to access both materials and alloys when generating the IdNum column later. Once you have all the data in the original dataframe, you can then create a new output frame containing just the information you need for the final result.

Advertisement

Answer