Skip to content
Advertisement

Group by from wide form in Pandas

I have a DataFrame like this one:

customer_type   age    satisfaction    design  food    wifi    service    distance
        Loyal    28   Not Satisfied         0    1        2          2        13.5
        Loyal    55       Satisfied         5    3        5          4        34.2
     Disloyal    36   Not Satisfied         2    0        2          4        55.8
     Disloyal    28   Not Satisfied         3    1        2          2        13.5
     Disloyal    33   Not Satisfied         2    1        2          2        13.5
     Disloyal    35   Not Satisfied         2    1        2          2        13.5
     Disloyal    39   Not Satisfied         1    1        2          2        13.5
     Disloyal    31   Not Satisfied         2    1        2          2        13.5
        Loyal    28   Not Satisfied         0    1        2          2        13.5
     Disloyal    31   Not Satisfied         2    1        2          2        13.5
     Disloyal    40   Not Satisfied         2    1        2          2        13.5
     Disloyal    35   Not Satisfied         2    1        2          2        13.5
     Disloyal    35   Not Satisfied         2    2        2          2        13.5

I want to find out the characteristics of the Disloyal and Not Satisfied customers that are between 30 and 40 years old, grouping them by the service they have rated:

 service   ratings_count    age   age_count   population_pct
  design               8    40    1           7.69
                            36    1           7.69
                            35    3           23.07
                            33    1           7.69
                            31    2           15.38
    food               1    35    1           7.69

I suspect I have to use melt but I can’t figure out how to groupby from there.

Advertisement

Answer

With the following toy dataframe, inspired by yours but a bit more heterogeneous:

import pandas as pd

df = pd.DataFrame(
    {
        "customer_type": [
            "Loyal",
            "Loyal",
            "Disloyal",
            "Disloyal",
            "Loyal",
            "Disloyal",
            "Disloyal",
            "Disloyal",
            "Loyal",
            "Disloyal",
            "Disloyal",
            "Loyal",
            "Disloyal",
        ],
        "age": [28, 55, 27, 31, 42, 35, 39, 31, 28, 51, 40, 35, 35],
        "satisfaction": [
            "Not Satisfied",
            "Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Satisfied",
            "Not Satisfied",
            "Not Satisfied",
            "Satisfied",
        ],
        "design": [0, 5, 2, 4, 2, 2, 1, 0, 1, 2, 0, 2, 1],
        "food": [1, 3, 0, 0, 1, 0, 1, 0, 1, 0, 1, 3, 2],
        "wifi": [2, 5, 2, 1, 2, 2, 0, 0, 1, 2, 2, 0, 2],
        "service": [2, 4, 1, 2, 2, 2, 0, 2, 3, 0, 1, 2, 2],
        "distance": [
            13.5,
            34.2,
            55.8,
            21.2,
            23.5,
            13.5,
            9.0,
            13.5,
            13.5,
            22.7,
            13.5,
            13.5,
            18.4,
        ],
    }
)

Here is one less subtel, although easier, way to do it:

# Setup
amenities = ["design", "food", "wifi", "service"]
dfs = []
new_df = df[
    df["age"].between(30, 40)
    & (df["customer_type"] == "Disloyal")
    & (df["satisfaction"] == "Not Satisfied")
]

# Iterate on each amenity, groupby age and add other columns/values
for amenity in amenities:
    temp_df = (
        new_df[new_df[amenity] > 0]
        .groupby("age")
        .agg({amenity: "count", "distance": "mean"})
        .reset_index(drop=False)
        .rename(columns={amenity: "age_count", "distance": "distance_mean"})
        .pipe(
            lambda df_: df_.assign(
                population_pct=(100 * df_["age_count"] / df_["age_count"].sum()).round(
                    2
                )
            )
        )
    )
    temp_df.loc[0, "amenities"] = amenity
    temp_df.loc[0, "ratings_count"] = temp_df["age_count"].sum()
    temp_df = pd.concat(
        [
            temp_df,
            new_df[(new_df[amenity] != 0)]
            .sort_values(by=["age"])
            .groupby("age")
            .agg({amenity: list})
            .reset_index(drop=True),
        ],
        axis=1,
    )
    temp_df = temp_df.rename(columns={amenity: "ratings"})
    temp_df["ratings"] = temp_df["ratings"].apply(lambda x: x[0] if len(x) == 1 else x)
    dfs.append(temp_df)

# Get final dataframe and cleanup
new_df = (
    pd.concat(dfs)
    .fillna(method="ffill")
    .reindex(
        columns=[
            "amenities",
            "ratings_count",
            "age",
            "age_count",
            "ratings",
            "distance_mean",
            "population_pct",
        ]
    )
    .astype({"ratings_count": "int"})
    .set_index(["amenities", "ratings_count"])
)

So that:

print(new_df)
# Output
                         age  age_count ratings  distance_mean  population_pct
amenities ratings_count                                                       
design    3               31          1       4          21.20           33.33
          3               35          1       2          13.50           33.33
          3               39          1       1           9.00           33.33
food      2               39          1       1           9.00           50.00
          2               40          1       1          13.50           50.00
wifi      3               31          1       1          21.20           33.33
          3               35          1       2          13.50           33.33
          3               40          1       2          13.50           33.33
service   4               31          2  [2, 2]          17.35           50.00
          4               35          1       2          13.50           25.00
          4               40          1       1          13.50           25.00

From here, you can filter ratings_count <=2 like this:

print(new_df[new_df.index.get_level_values(1) <= 2])
# Output
                         age  age_count ratings  distance_mean  population_pct
amenities ratings_count                                                       
food      2               39          1       1            9.0            50.0
          2               40          1       1           13.5            50.0
User contributions licensed under: CC BY-SA
2 People found this is helpful
Advertisement