forked from jervisfm/Digit-Recognizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreadDATA.m
190 lines (145 loc) · 5.57 KB
/
readDATA.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
% Jervis Muindi
% 28th November 2011
% MATLAB Code to read in MNIST Binary data files
% obtained from http://yann.lecun.com/exdb/mnist/
%Function return values:
% train_data -> cell array that has the training image matrices
% train_labels -> cell array that has the actual values of the train data
% images
% test_data -> cell array that has the test image matrices
% test_labels -> cell array that has the actual values of the test data
% images
function [train_data, train_labels, test_data, test_labels] = readDATA()
% READDATA Read the MNIST binary (IDX) data files found under ./MNIST/.
%   The files are the ones published at http://yann.lecun.com/exdb/mnist/
%
%   Return values (each a 1xN cell array):
%     train_data   -> training image matrices; every image is a double
%                     matrix divided by its own maximum pixel value
%     train_labels -> label (0 to 9) of each training image
%     test_data    -> test image matrices, normalized the same way
%     test_labels  -> label of each test image
%
%   If a file does not start with the expected IDX magic number, an error
%   message is displayed and loading stops; outputs that were not loaded
%   yet are returned as empty cell arrays. A file that cannot be opened,
%   or whose header is truncated, raises an error.

    % Assign every output up front so that an early return can never leave
    % an output argument undefined.
    train_data = {};
    train_labels = {};
    test_data = {};
    test_labels = {};

    [train_data, ok] = readIdxImages('./MNIST/train-images.idx3-ubyte');
    if ~ok
        return; % stop processing
    end

    [train_labels, ok] = readIdxLabels('./MNIST/train-labels.idx1-ubyte');
    if ~ok
        return; % stop processing
    end

    [test_data, ok] = readIdxImages('./MNIST/t10k-images.idx3-ubyte');
    if ~ok
        return; % stop processing
    end

    [test_labels, ok] = readIdxLabels('./MNIST/t10k-labels.idx1-ubyte');
    if ~ok
        return; % stop processing
    end

    disp('MNIST Data successully loaded');

function [images, ok] = readIdxImages(filePath)
% READIDXIMAGES Read every image of an IDX3 image file into a cell array.
%   ok is false (and images is empty) when the magic number is not 2051.
%
%   Format of an image file:
%     [offset] [type]          [value]          [description]
%     0000     32 bit integer  0x00000803(2051) magic number
%     0004     32 bit integer  60000            number of images
%     0008     32 bit integer  28               number of rows
%     0012     32 bit integer  28               number of columns
%     0016     unsigned byte   ??               pixel
%     ........
%     xxxx     unsigned byte   ??               pixel
    images = {};
    ok = false;

    % 'b' selects big-endian: the most significant BYTE of every integer
    % is stored first, which is how the IDX header fields are written.
    [fid, msg] = fopen(filePath, 'r', 'b');
    if fid < 0
        error('readDATA:fileOpen', 'Cannot open %s: %s', filePath, msg);
    end
    % Close the file however this function exits (return or error).
    closer = onCleanup(@() fclose(fid)); %#ok<NASGU>

    magicNumber = fread(fid, 1, 'int32');
    % isempty guards against an empty file, where fread returns [] and a
    % plain ~= comparison would silently evaluate as false.
    if isempty(magicNumber) || magicNumber ~= 2051
        disp('Error: Cannot find magic number of 2051. Please check to make sure that file is in IDX format');
        return;
    end

    dims = fread(fid, 3, 'int32'); % [number of images; rows; columns]
    if numel(dims) < 3
        error('readDATA:truncated', 'Header of %s is truncated.', filePath);
    end
    imagesNo = dims(1);
    rowSize = dims(2);
    colSize = dims(3);

    images = cell(1, imagesNo); % preallocate instead of growing in the loop
    for i = 1:imagesNo
        % Pixels are stored row by row, while fread fills its result
        % column by column. Reading a colSize-by-rowSize matrix therefore
        % puts one image row in each column; transposing gives the
        % rowSize-by-colSize image. (fread already returns doubles for
        % the 'uint8' precision, so no extra conversion is needed.)
        img = fread(fid, [colSize rowSize], 'uint8');
        img = img.';

        % Normalize by the image's own maximum; an all-zero image is left
        % as zeros rather than becoming NaN through 0/0.
        peak = max(img(:));
        if peak > 0
            img = img / peak;
        end
        images{i} = img;
    end
    ok = true;

function [labels, ok] = readIdxLabels(filePath)
% READIDXLABELS Read every label of an IDX1 label file into a cell array.
%   ok is false (and labels is empty) when the magic number is not 2049.
%
%   Format of a label file:
%     [offset] [type]          [value]          [description]
%     0000     32 bit integer  0x00000801(2049) magic number (MSB first)
%     0004     32 bit integer  60000            number of items
%     0008     unsigned byte   ??               label
%     ........
%     xxxx     unsigned byte   ??               label
%   The label values are 0 to 9.
    labels = {};
    ok = false;

    [fid, msg] = fopen(filePath, 'r', 'b'); % 'b' is for big-endian
    if fid < 0
        error('readDATA:fileOpen', 'Cannot open %s: %s', filePath, msg);
    end
    % Close the file however this function exits (return or error).
    closer = onCleanup(@() fclose(fid)); %#ok<NASGU>

    magicNumber = fread(fid, 1, 'int32');
    if isempty(magicNumber) || magicNumber ~= 2049
        disp('Error: Cannot find magic number of 2049. Please check to make sure that file is in IDX format');
        return;
    end

    itemsNo = fread(fid, 1, 'int32'); % number of labelled items
    if isempty(itemsNo)
        error('readDATA:truncated', 'Header of %s is truncated.', filePath);
    end

    % Read all labels in one call, then split them into a 1xN cell array
    % with one double per cell.
    raw = fread(fid, itemsNo, 'uint8');
    labels = num2cell(raw.');
    ok = true;
function showImage()
% SHOWIMAGE Display the training images stacked vertically in figure 1.
%   Loads the data with readDATA and shows, as one tall image, every
%   training image that belongs to a complete group of 20 (images left
%   over after the last complete group are not shown).
    train_data = readDATA(); % only the training images are needed here
    figure(1);

    groupSize = 20;
    % Use the number of images actually loaded rather than assuming 60000,
    % so a smaller data set does not cause an out-of-range index.
    shownCount = groupSize * floor(numel(train_data) / groupSize);

    % One concatenation instead of repeatedly growing the matrix, which
    % re-copied everything accumulated so far on each step.
    X = [];
    if shownCount > 0
        X = vertcat(train_data{1:shownCount});
    end
    imshow(X);