I am attempting to build a model that predicts whether a credit card transaction is fraudulent. My dataset is available on Kaggle. Everything works until I fit my model, at which point I get this error:
```
ValueError: Data cardinality is ambiguous:
  x sizes: 7433462
  y sizes: 284807
Make sure all arrays contain the same number of samples.
```
Could someone help me figure out what’s wrong?
```python
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv("creditcard.csv")

trainLabels = data['Class']
labels = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
          'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
          'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
trainSamples = data[labels]

trainLabels = np.array(trainLabels)
trainSamples = np.array(trainSamples)

trainLabels = shuffle(trainLabels)
trainSamples = shuffle(trainSamples)

scaler = MinMaxScaler(feature_range=(0, 1))
scaledTrainSample = scaler.fit_transform(trainSamples.reshape(-1, 1))

model = Sequential([
    Dense(units=16, input_shape=(1, ), activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=2, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(x=scaledTrainSample, y=trainLabels, validation_split=0.1,
          batch_size=10, epochs=300, verbose=2)
```
Answer
The main issue with your code is the model's shape on both ends: the input shape should be 30, not 1, since the dataset has 30 feature columns (note that your `labels` list also accidentally skips `'V11'`), and the output layer should have 1 unit, not 2, since the target is a single binary label (two classes, 0 or 1). The ValueError itself is caused by `trainSamples.reshape(-1, 1)`, which flattens the entire feature matrix into a single column, so x ends up with many times more rows than y has labels. There were a few other bugs as well, notably shuffling the samples and the labels independently, which breaks the row-to-label pairing; all of them are corrected in the code below.
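To see where the mismatched x and y sizes come from, here is a minimal sketch with a small synthetic array standing in for the real data:

```python
import numpy as np

# 5 samples with 30 features each, standing in for the real data
X = np.random.rand(5, 30)
y = np.array([0, 1, 0, 0, 1])

# reshape(-1, 1) flattens all features into a single column:
# 5 * 30 = 150 rows, while y still has only 5 entries
X_flat = X.reshape(-1, 1)
print(X_flat.shape)  # (150, 1)
print(y.shape)       # (5,)

# Keras now sees 150 x samples but 5 y samples, which triggers the
# "Data cardinality is ambiguous" ValueError at model.fit()
```

MinMaxScaler already works column-by-column on a 2-D array, so no reshape is needed at all. With that in mind, the full corrected script: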
```python
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

tf.random.set_seed(0)

# import the data
df = pd.read_csv('creditcard.csv')

# extract the features and target
X = df.drop(labels=['Class'], axis=1).values
y = df['Class'].values

# count the number of classes
print(np.unique(y))
# [0 1]

# shuffle the data (features and labels together, keeping rows paired)
X, y = shuffle(X, y, random_state=42)

# scale the features
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

# build the model
model = Sequential([
    Dense(units=16, activation='relu', input_shape=(X.shape[1], )),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# fit the model
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(x=X, y=y, validation_split=0.1, batch_size=256, epochs=3)
# Epoch 1/3
# 1002/1002 [==============================] - 1s 761us/step - loss: 0.1787 - accuracy: 0.9983 - val_loss: 0.0193 - val_accuracy: 0.9981
# Epoch 2/3
# 1002/1002 [==============================] - 1s 684us/step - loss: 0.0136 - accuracy: 0.9983 - val_loss: 0.0130 - val_accuracy: 0.9981
# Epoch 3/3
# 1002/1002 [==============================] - 1s 680us/step - loss: 0.0119 - accuracy: 0.9983 - val_loss: 0.0127 - val_accuracy: 0.9981
```
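One caveat about those numbers: this Kaggle dataset is highly imbalanced (only 492 of the 284,807 transactions are frauds, about 0.17%), so an accuracy around 0.998 is roughly what you would get by always predicting "not fraud". As a sketch of one common mitigation, continuing from the script above, you could pass inverse-frequency class weights to `fit` and track AUC instead of plain accuracy; the weighting heuristic here is illustrative, not tuned for this problem:

```python
# compute inverse-frequency class weights from the labels
neg, pos = np.bincount(y.astype(int))
total = neg + pos
class_weight = {0: total / (2.0 * neg), 1: total / (2.0 * pos)}

# recompile with AUC, which is far more informative than accuracy
# on a 0.17%-positive dataset
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.AUC(name='auc')])
model.fit(x=X, y=y, validation_split=0.1, batch_size=256, epochs=3,
          class_weight=class_weight)
```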