I have written an intent classification program. It is first trained on training data and then evaluated on test data. The training process takes a few seconds. What is the best way to save the result of this training so that the model does not have to be retrained on every run? Is it enough to save train_X and train_y, or does the model itself have to be saved somehow?
Python
x
121
121
1
import numpy as np
2
import pandas as pd
3
import os
4
import spacy
5
import csv
6
from sklearn.preprocessing import LabelEncoder
7
from sklearn.svm import SVC
8
9
10
# read data from csv
11
def read_data(path):
    """Read labelled sentences from a two-column CSV file.

    Every non-empty row must hold the label in its first column and the
    sentence in its second column.

    Args:
        path: Location of the CSV file.

    Returns:
        A ``(sentences, labels)`` tuple of two equally long lists of strings,
        in file order.

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    labels = []
    sentences = []
    # The csv module asks for newline='' so that line breaks inside quoted
    # fields are interpreted by the reader rather than by the file object.
    with open(path, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # Blank lines are yielded as empty rows; there is nothing to
                # index, so skip them instead of crashing.
                continue
            labels.append(row[0])
            sentences.append(row[1])
    return sentences, labels
22
23
24
# Loading Test Data
sentences_test, labels_test = read_data('./a_test.csv')

# print out the first two rows, followed by an empty line
print(sentences_test[:2], '\n')
print(labels_test[:2])

# Loading Training Data
sentences_train, labels_train = read_data('./a_train.csv')

# Load the spacy model: nlp
nlp = spacy.load('en_core_web_lg')
# Width of the word vectors shipped with the model; used to size the
# sentence-embedding matrices below.
embedding_dim = nlp.vocab.vectors_length
print(embedding_dim)
39
40
41
def encode_sentences(sentences):
    """Turn sentences into a matrix of spaCy document vectors.

    Relies on the module-level ``nlp`` pipeline and ``embedding_dim``.

    Args:
        sentences: Sequence of sentence strings.

    Returns:
        A NumPy array of shape ``(len(sentences), embedding_dim)`` whose row
        ``i`` is the ``.vector`` of the document built from sentence ``i``.
    """
    n_sentences = len(sentences)
    print('Length :-', n_sentences)

    X = np.zeros((n_sentences, embedding_dim))
    for idx, sentence in enumerate(sentences):
        # Pass each sentence to the nlp object and store the resulting
        # document's vector in the matching row.
        X[idx, :] = nlp(sentence).vector
    return X
58
59
60
# Encode both data sets into sentence-vector matrices
train_X = encode_sentences(sentences_train)
test_X = encode_sentences(sentences_test)
62
63
64
# every label gets his own number
65
def label_encoding(labels, encoder=None):
    """Map string labels to integer class ids.

    Args:
        labels: Sequence of label strings.
        encoder: Optional object with ``fit_transform`` and ``transform``
            methods, such as a scikit-learn ``LabelEncoder``. An encoder that
            is already fitted (it has a ``classes_`` attribute) is reused via
            ``transform``; an unfitted one is fitted on ``labels``. When
            omitted, a fresh ``LabelEncoder`` is fitted on ``labels`` alone.

    Returns:
        The array of integer ids produced by the encoder, one per label.

    Note:
        Ids assigned by separately fitted encoders only agree when both label
        sets contain exactly the same classes. Pass the same encoder for the
        training and the test labels to guarantee a consistent mapping.
    """
    n_labels = len(labels)
    print('Number of labels :-', n_labels)

    if encoder is None:
        encoder = LabelEncoder()

    if hasattr(encoder, 'classes_'):
        # Already fitted: keep the existing label -> id mapping.
        y = encoder.transform(labels)
    else:
        y = encoder.fit_transform(labels)

    print(y[:100])
    print('Length of y :- ', y.shape)
    return y
76
77
78
# Integer-encode the labels of both data sets
train_y = label_encoding(labels_train)
test_y = label_encoding(labels_test)

# Load the training file a second time as a DataFrame for a quick inspection.
# NOTE(review): read_csv treats the first row as a header by default, whereas
# read_data() treats it as data - confirm which one matches the file.
df1 = pd.read_csv('./a_train.csv', delimiter=',')
df1.dataframeName = 'a_train.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

# NOTE(review): the results of these two calls are not assigned or printed, so
# they are only visible in an interactive session such as a notebook.
df1.sample(10)
df1.describe()
89
90
91
# X_train and y_train was given.
92
def svc_training(X, y):
    """Fit a support vector classifier on the given data.

    Args:
        X: Training features, passed unchanged to ``SVC.fit``.
        y: Training targets, passed unchanged to ``SVC.fit``.

    Returns:
        The fitted ``SVC`` instance, created with ``C=1``.
    """
    # Create a support vector classifier
    clf = SVC(C=1)

    # Fit the classifier using the training data
    clf.fit(X, y)

    return clf
100
101
102
# Train the classifier and show its predictions on the training set
model = svc_training(train_X, train_y)
print(model.predict(train_X))
104
105
# Validation Step
106
def svc_validation(model, X, y):
    """Report how many predictions of ``model`` on ``X`` match ``y``.

    Args:
        model: Object with a ``predict(X)`` method returning one prediction
            per sample.
        X: Samples handed to ``model.predict``.
        y: Expected labels, one per sample.

    Returns:
        The number of positions at which the prediction equals the expected
        label. The same number is printed together with ``len(y)``.
    """
    # Predict the labels of the given set
    y_pred = model.predict(X)

    # Count the number of correct predictions
    n_correct = sum(
        1 for predicted, expected in zip(y_pred, y) if predicted == expected
    )

    print("Predicted {0} correctly out of {1} training examples".format(n_correct, len(y)))
    return n_correct
117
118
119
#svc_validation(model, train_X, train_y)
120
#svc_validation(model, test_X, test_y)
121
Advertisement
Answer
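From what I gather, you are trying to save the data so that you don't need to recalculate it every time you run the program. The best I can do is suggest that you write the data to a dedicated text file that serves as a data dump. Note that this only stores the encoded data; to skip the training step itself, the fitted model object has to be saved as well (for example with `joblib.dump`) and loaded again on later runs.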
Python
1
4
1
import json

# file.write() only accepts strings, so the arrays are converted to nested
# lists and serialised as JSON text; json.load() can restore them later.
# Assumes train_X and train_y are NumPy arrays, as produced by the script.
with open("datadump.txt", "w") as file:
    json.dump({"train_X": train_X.tolist(), "train_y": train_y.tolist()}, file)
4