I have written an intent classification program. It is first trained on training data and then evaluated on test data. The training process takes a few seconds. What is the best way to save the result of the training so that the model does not have to be retrained on every run? Is it enough to save train_X and train_y, or does the fitted model itself have to be saved somehow?
import csv
import os

import numpy as np
import pandas as pd
import spacy
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


# read data from csv
def read_data(path):
    """Read a two-column CSV file of (label, sentence) rows.

    Args:
        path: Path of the CSV file. Column 0 is taken as the label and
            column 1 as the sentence.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists of strings.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    labels = []
    sentences = []
    # newline='' is what the csv module asks for, so that quoted fields
    # containing line breaks are parsed correctly.
    with open(path, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing on row[0].
                continue
            labels.append(row[0])
            sentences.append(row[1])
    return sentences, labels


# Loading Test Data
sentences_test, labels_test = read_data('./a_test.csv')
# print out the first two rows, followed by a blank line
# (the original passed 'n', which printed a literal "n")
print(sentences_test[:2], '\n')
print(labels_test[:2])

# Loading Training Data
sentences_train, labels_train = read_data('./a_train.csv')

# Load the spacy model: nlp
nlp = spacy.load('en_core_web_lg')
embedding_dim = nlp.vocab.vectors_length
print(embedding_dim)


def encode_sentences(sentences):
    """Turn sentences into a matrix with one embedding row per sentence.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        A NumPy array of shape ``(len(sentences), embedding_dim)`` whose row
        ``i`` is the ``.vector`` attribute of the spaCy document created from
        ``sentences[i]``.
    """
    # Calculate number of sentences
    n_sentences = len(sentences)
    print('Length :-', n_sentences)
    X = np.zeros((n_sentences, embedding_dim))
    for idx, sentence in enumerate(sentences):
        # Pass each sentence to the nlp object and store the document vector
        # in the corresponding row of X
        X[idx, :] = nlp(sentence).vector
    return X


train_X = encode_sentences(sentences_train)
test_X = encode_sentences(sentences_test)


# every label gets its own number
def label_encoding(labels, encoder=None):
    """Map label strings to integer class ids.

    Args:
        labels: Sequence of label strings.
        encoder: Optional, already fitted ``LabelEncoder``. When given, the
            labels are only transformed with it, so several data splits share
            one label-to-id mapping. When omitted, a fresh encoder is fitted
            on ``labels`` alone.

    Returns:
        Array of integer ids, one per entry of ``labels``.

    Raises:
        ValueError: If ``encoder`` is given and ``labels`` contains a value
            the encoder was not fitted on.
    """
    # Calculate the length of labels
    n_labels = len(labels)
    print('Number of labels :-', n_labels)
    if encoder is None:
        encoder = LabelEncoder()
        y = encoder.fit_transform(labels)
    else:
        y = encoder.transform(labels)
    print(y[:100])
    print('Length of y :- ', y.shape)
    return y


# Fit the encoder once on the training labels and reuse it for the test
# labels. Fitting a separate encoder per split can assign different ids to
# the same label when the splits do not contain the same set of labels.
label_encoder = LabelEncoder().fit(labels_train)
train_y = label_encoding(labels_train, label_encoder)
test_y = label_encoding(labels_test, label_encoder)

# NOTE(review): pd.read_csv treats the first row as a header by default,
# while read_data() above treats every row as data - confirm which is intended.
df1 = pd.read_csv('./a_train.csv', delimiter=',')
df1.dataframeName = 'a_train.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')
# The results of the next two calls are not assigned or printed here.
df1.sample(10)
df1.describe()
# X_train and y_train was given.
def svc_training(X, y):
    """Fit a support vector classifier on the given features and labels.

    Args:
        X: Feature matrix with one row per sample.
        y: Target label for each row of ``X``.

    Returns:
        The fitted ``SVC`` instance.
    """
    # Create a support vector classifier
    clf = SVC(C=1)
    # Fit the classifier using the training data
    clf.fit(X, y)
    return clf


model = svc_training(train_X, train_y)
print(model.predict(train_X))


# Validation Step
def svc_validation(model, X, y):
    """Print how many samples of ``X`` the model labels correctly.

    Args:
        model: Fitted classifier exposing ``predict``.
        X: Feature matrix to predict on.
        y: Expected label for each row of ``X``.
    """
    # Predict the labels of the given set
    y_pred = model.predict(X)
    # Count the number of correct predictions
    n_correct = sum(
        1 for predicted, expected in zip(y_pred, y) if predicted == expected
    )
    # The message says "training examples" whichever split is passed in.
    print(f"Predicted {n_correct} correctly out of {len(y)} training examples")


#svc_validation(model, train_X, train_y)
#svc_validation(model, test_X, test_y)
Advertisement
Answer
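From what I gather, you are trying to save the data so that you don't need to calculate it again every time you run the program. The best I can suggest is to write the data to a dedicated text file that serves as a data dump. Note that this only stores the encoded features and labels (train_X and train_y); the classifier itself would still have to be fitted on them again after loading.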
# Dump the encoded training data to a text file so it does not have to be
# recomputed on the next run. The mode must be the string "w", and
# file.write() only accepts str, so the arrays are serialised with np.savetxt.
with open("datadump.txt", "w") as file:
    # One row per sample: the feature columns followed by the label in the
    # last column. Read back with
    #   data = np.loadtxt("datadump.txt")
    #   train_X, train_y = data[:, :-1], data[:, -1].astype(int)
    np.savetxt(file, np.column_stack([train_X, train_y]))