I am trying to create a PyTorch Dataset and DataLoader object using some sample data.
This is the tab-separated dataset:
1 0 0.171429 1 0 0 0.966805 0
0 1 0.085714 0 1 0 0.188797 1
1 0 0.000000 0 0 1 0.690871 2
1 0 0.057143 0 1 0 1.000000 1
0 1 1.000000 0 0 1 0.016598 2
1 0 0.171429 1 0 0 0.802905 0
0 1 0.171429 1 0 0 0.966805 1
1 0 0.257143 0 1 0 0.329876 0
This is the code that creates the Dataset and DataLoader objects from the data above:
import numpy as np
import torch as T

device = T.device("cpu")  # to Tensor or Module

# ---------------------------------------------------
# predictors and label in same file
# data has been normalized and encoded like:
#   sex     age      region   income    politic
#   [0]     [2]      [3]      [6]       [7]
#   1 0   0.057143   0 1 0   0.690871     2


class PeopleDataset(T.utils.data.Dataset):
    """Dataset of people records loaded from a tab-delimited text file.

    Columns 0-6 of each row are the predictors and column 7 is the
    integer class label.
    """

    def __init__(self, src_file, num_rows=None):
        """Load predictors and labels from *src_file*.

        Args:
            src_file: path to the tab-delimited data file.
            num_rows: maximum number of rows to read (None reads all).

        Raises:
            IndexError: from ``np.loadtxt`` (observed with older NumPy)
                when a row has fewer than eight tab-separated fields,
                e.g. when the file is actually space-separated.
        """
        # NOTE: the file must really be tab-delimited.  With a
        # space-separated file every row parses as ONE field, so the
        # usecols selection below fails.
        x_tmp = np.loadtxt(src_file, max_rows=num_rows,
                           usecols=range(0, 7), delimiter="\t",
                           skiprows=0, dtype=np.float32)
        # np.int64 instead of the np.long alias, which was deprecated and
        # then removed from NumPy; the labels end up as T.long either way.
        y_tmp = np.loadtxt(src_file, max_rows=num_rows,
                           usecols=7, delimiter="\t", skiprows=0,
                           dtype=np.int64)

        self.x_data = T.tensor(x_tmp, dtype=T.float32).to(device)
        self.y_data = T.tensor(y_tmp, dtype=T.long).to(device)

    def __len__(self):
        """Return the number of loaded rows."""
        return len(self.x_data)  # required

    def __getitem__(self, idx):
        """Return the sample(s) at *idx* as a dict.

        The dict has keys 'predictors' (the seven predictor values) and
        'political' (the class label).
        """
        if T.is_tensor(idx):
            idx = idx.tolist()
        preds = self.x_data[idx, 0:7]
        pol = self.y_data[idx]
        sample = {'predictors': preds, 'political': pol}
        return sample

# ---------------------------------------------------


def main():
    """Build the Dataset/DataLoader and print every batch for two epochs."""
    print("\nBegin PyTorch DataLoader demo ")

    # 0. miscellaneous prep
    T.manual_seed(0)
    np.random.seed(0)

    print("\nSource data looks like: ")
    print("1 0  0.171429  1 0 0  0.966805  0")
    print("0 1  0.085714  0 1 0  0.188797  1")
    print(" . . . ")

    # 1. create Dataset and DataLoader object
    print("\nCreating Dataset and DataLoader ")

    train_file = "people_train.txt"
    train_ds = PeopleDataset(train_file, num_rows=8)

    bat_size = 3
    train_ldr = T.utils.data.DataLoader(train_ds,
                                        batch_size=bat_size, shuffle=True)

    # 2. iterate thru training data twice
    for epoch in range(2):
        print("\n==============================\n")
        print("Epoch = " + str(epoch))
        for (batch_idx, batch) in enumerate(train_ldr):
            print("\nBatch = " + str(batch_idx))
            X = batch['predictors']  # [3,7]
            # Y = T.flatten(batch['political'])
            Y = batch['political']   # [3]
            print(X)
            print(Y)
    print("\n==============================")

    print("\nEnd demo ")


if __name__ == "__main__":
    main()
The code is simply saved with the filename "demo.py". It should execute successfully when the command `python demo.py` is run at a command prompt. I use Anaconda Prompt, which has Torch (v1.10) installed.
I have tried numerous methods to get the above working, but I only get an error which says:
Source data looks like:
1 0 0.171429 1 0 0 0.966805 0
0 1 0.085714 0 1 0 0.188797 1
 . . .

Creating Dataset and DataLoader
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-cfb1177991f2> in <module>()
     81
     82 if __name__ == "__main__":
---> 83   main()

4 frames
<ipython-input-8-cfb1177991f2> in main()
     59
     60   train_file = "people_train.txt"
---> 61   train_ds = PeopleDataset(train_file, num_rows=8)
     62
     63   bat_size = 3

<ipython-input-8-cfb1177991f2> in __init__(self, src_file, num_rows)
     20     x_tmp = np.loadtxt(src_file, max_rows=num_rows,
     21       usecols=range(0,7), delimiter="\t",
---> 22       skiprows=0, dtype=np.float32)
     23     y_tmp = np.loadtxt(src_file, max_rows=num_rows,
     24       usecols=7, delimiter="\t", skiprows=0,

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in loadtxt(fname, dtype, comments, delimiter, converters, skiprows, usecols, unpack, ndmin, encoding, max_rows)
   1137         # converting the data
   1138         X = None
-> 1139         for x in read_data(_loadtxt_chunksize):
   1140             if X is None:
   1141                 X = np.array(x, dtype)

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in read_data(chunk_size)
   1058                 continue
   1059             if usecols:
-> 1060                 vals = [vals[j] for j in usecols]
   1061             if len(vals) != N:
   1062                 line_num = i + skiprows + 1

/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py in <listcomp>(.0)
   1058                 continue
   1059             if usecols:
-> 1060                 vals = [vals[j] for j in usecols]
   1061             if len(vals) != N:
   1062                 line_num = i + skiprows + 1

IndexError: list index out of range
I am not able to see which index is wrong, as the indexing looks correct to me. Can someone please help me?
Advertisement
Answer
Your data seems to be space-separated, not tab-separated. So, when you specify delimiter="\t", the entire row is read as a single column. But because of usecols=range(0,7), NumPy expects there to be seven columns, and throws an error when it tries to index into them.
To fix this, either replace the spaces in your data with tabs, or change the delimiter argument to delimiter=" ".