Group by from wide form in Pandas

Question

I have a DataFrame like this one: I want to find out the characteristics of the Disloyal and Not Satisfied customers that are between 30 and 40 years old, grouping them by the service they have rated: I suspect I have to use melt but I can&#8217;t figure out how to groupby from there. Answer With the followin…

Accepted Answer

With the following toy dataframe, inspired by yours but a bit more heterogeneous:import pandas as pddf = pd.DataFrame(    {        "customer_type": [            "Loyal",            "Loyal",            "Disloyal",            "Disloyal",            "Loyal",            "Disloyal",            "Disloyal",            "Disloyal",            "Loyal",            "Disloyal",            "Disloyal",            "Loyal",            "Disloyal",        ],        "age": [28, 55, 27, 31, 42, 35, 39, 31, 28, 51, 40, 35, 35],        "satisfaction": [            "Not Satisfied",            "Satisfied",            "Not Satisfied",            "Not Satisfied",            "Not Satisfied",            "Not Satisfied",            "Not Satisfied",            "Not Satisfied",            "Not Satisfied",            "Satisfied",            "Not Satisfied",            "Not Satisfied",            "Satisfied",        ],        "design": [0, 5, 2, 4, 2, 2, 1, 0, 1, 2, 0, 2, 1],        "food": [1, 3, 0, 0, 1, 0, 1, 0, 1, 0, 1, 3, 2],        "wifi": [2, 5, 2, 1, 2, 2, 0, 0, 1, 2, 2, 0, 2],        "service": [2, 4, 1, 2, 2, 2, 0, 2, 3, 0, 1, 2, 2],        "distance": [            13.5,            34.2,            55.8,            21.2,            23.5,            13.5,            9.0,            13.5,            13.5,            22.7,            13.5,            13.5,            18.4,        ],    })Here is one less subtel, although easier, way to do it:# Setupamenities = ["design", "food", "wifi", "service"]dfs = []new_df = df[    df["age"].between(30, 40)    & (df["customer_type"] == "Disloyal")    & (df["satisfaction"] == "Not Satisfied")]# Iterate on each amenity, groupby age and add other columns/valuesfor amenity in amenities:    temp_df = (        new_df[new_df[amenity] > 0]        .groupby("age")        .agg({amenity: "count", "distance": "mean"})        .reset_index(drop=False)        .rename(columns={amenity: "age_count", "distance": "distance_mean"})        .pipe(            lambda df_: df_.assign(                population_pct=(100 * df_["age_count"] / df_["age_count"].sum()).round(                    2                )            )        )    )    temp_df.loc[0, "amenities"] = amenity    temp_df.loc[0, "ratings_count"] = temp_df["age_count"].sum()    temp_df = pd.concat(        [            temp_df,            new_df[(new_df[amenity] != 0)]            .sort_values(by=["age"])            .groupby("age")            .agg({amenity: list})            .reset_index(drop=True),        ],        axis=1,    )    temp_df = temp_df.rename(columns={amenity: "ratings"})    temp_df["ratings"] = temp_df["ratings"].apply(lambda x: x[0] if len(x) == 1 else x)    dfs.append(temp_df)# Get final dataframe and cleanupnew_df = (    pd.concat(dfs)    .fillna(method="ffill")    .reindex(        columns=[            "amenities",            "ratings_count",            "age",            "age_count",            "ratings",            "distance_mean",            "population_pct",        ]    )    .astype({"ratings_count": "int"})    .set_index(["amenities", "ratings_count"]))So that:print(new_df)# Output                         age  age_count ratings  distance_mean  population_pctamenities ratings_count                                                       design    3               31          1       4          21.20           33.33          3               35          1       2          13.50           33.33          3               39          1       1           9.00           33.33food      2               39          1       1           9.00           50.00          2               40          1       1          13.50           50.00wifi      3               31          1       1          21.20           33.33          3               35          1       2          13.50           33.33          3               40          1       2          13.50           33.33service   4               31          2  [2, 2]          17.35           50.00          4               35          1       2          13.50           25.00          4               40          1       1          13.50           25.00From here, you can filter ratings_count <=2 like this:print(new_df[new_df.index.get_level_values(1) <= 2])# Output                         age  age_count ratings  distance_mean  population_pctamenities ratings_count                                                       food      2               39          1       1            9.0            50.0          2               40          1       1           13.5            50.0

Advertisement

Answer