|
| 1 | +from struct import unpack |
| 2 | +import gzip,os,json,numpy |
| 3 | +from numpy import zeros, uint8 |
| 4 | +from pylab import imshow, show, cm |
| 5 | +import cPickle as pickle |
| 6 | + |
def get_labeled_data(imagefile, labelfile):
    """
    Read gzip-compressed IDX image and label files into numpy arrays.

    Each file starts with a 4-byte magic number (skipped) followed by
    big-endian unsigned 32-bit header fields: the item count and, for the
    image file only, the row and column counts. After the header every
    pixel and every label is a single unsigned byte.

    Args:
        imagefile: Path of the gzip-compressed image file.
        labelfile: Path of the gzip-compressed label file.

    Returns:
        A tuple ``(x, y, header)``:
        ``x`` is a float array of shape ``(N, rows*cols)`` with every pixel
        divided by 255; ``y`` is a uint8 array of shape ``(N, 1)`` with the
        labels; ``header`` is a dict with ``'featdim'`` (``rows*cols``) and
        ``'input_shape'`` (``[rows, cols, 1]``).

    Raises:
        Exception: If the image count and the label count differ.
        ValueError: If a file holds fewer payload bytes than its header
            declares.
    """
    # The context managers guarantee both gzip handles are closed, even when
    # a header is malformed and an exception is raised.
    with gzip.open(imagefile, 'rb') as images, \
            gzip.open(labelfile, 'rb') as labels:
        # We have to get big endian unsigned ints, hence '>I'.
        images.read(4)  # skip the magic_number
        number_of_images, rows, cols = unpack('>III', images.read(12))

        labels.read(4)  # skip the magic_number
        N = unpack('>I', labels.read(4))[0]

        if number_of_images != N:
            raise Exception('The number of labels did not match '
                            'the number of images.')

        featdim = rows * cols
        # One bulk read per file instead of one read() call per byte.
        pixel_bytes = images.read(N * featdim)
        label_bytes = labels.read(N)

    if len(pixel_bytes) != N * featdim or len(label_bytes) != N:
        raise ValueError('The data files are shorter than their headers '
                         'declare.')

    # astype() makes a fresh writable float copy (frombuffer returns a
    # read-only view); the in-place division then scales pixels to [0, 1].
    x = numpy.frombuffer(pixel_bytes, dtype=uint8).astype(float)
    x /= 255
    x = x.reshape(N, featdim)

    # copy() keeps the labels writable so callers can shuffle them in place.
    y = numpy.frombuffer(label_bytes, dtype=uint8).reshape(N, 1).copy()

    header = {
        'featdim': featdim,
        'input_shape': [rows, cols, 1],
    }
    return x, y, header
| 60 | + |
| 61 | + |
def saveData(name, x, y, header):
    """
    Append a JSON header line and the packed samples to a binary file.

    The file is opened in append mode, so calling this on an existing file
    adds a second header and record run after the current contents. The
    header is written as one line of JSON; each sample follows as one
    record made of ``header['featdim']`` big-endian 16-bit floats (field
    ``'d'``) and one big-endian 16-bit integer label (field ``'l'``).

    Args:
        name: Path of the output file.
        x: Sequence of feature vectors, each with ``header['featdim']``
            values.
        y: Sequence of labels, paired with ``x`` by position; surplus
            entries in the longer sequence are ignored.
        header: JSON-serializable dict; must contain ``'featdim'``.
    """
    record_type = numpy.dtype({'names': ['d', 'l'],
                               'formats': [('>f2', header['featdim']), '>i2']})

    # Pair up samples and labels the way zip() would: stop at the shorter.
    count = min(len(x), len(y))
    records = numpy.zeros(count, dtype=record_type)
    # Reshape to the field's own shape so empty input and featdim == 1
    # are handled without relying on broadcasting.
    records['d'] = numpy.asarray(x)[:count].reshape(records['d'].shape)
    records['l'] = numpy.asarray(y)[:count].reshape(count)

    # 'with' closes the handle even if a write fails part-way through.
    with open(name, 'ab') as filehandle:
        # The file is binary, so the header text must be encoded to bytes.
        filehandle.write((json.dumps(header) + '\n').encode('utf-8'))
        records.tofile(filehandle)
| 75 | + |
if __name__ == '__main__':
    # Convert the test set as-is.
    print("Get testset")
    x, y, h = get_labeled_data('t10k-images-idx3-ubyte.gz',
                               't10k-labels-idx1-ubyte.gz')
    print("Got %i testing datasets." % len(x))
    saveData('test.dat', x, y, h)

    # Convert the training set, shuffled and split 75% / 25% into
    # training and validation files.
    print("Get trainingset")
    x, y, h = get_labeled_data('train-images-idx3-ubyte.gz',
                               'train-labels-idx1-ubyte.gz')
    print("Got %i training datasets." % len(x))

    # Reseeding before each in-place shuffle is meant to apply the same
    # permutation to both arrays so images and labels stay paired; this
    # relies on x and y having the same length.
    seed = 9090
    numpy.random.seed(seed)
    numpy.random.shuffle(x)
    numpy.random.seed(seed)
    numpy.random.shuffle(y)

    # The validation part starts exactly at the split index so that no
    # sample is left out of both files.
    split = int(len(x) * 0.75)
    xtrain = x[:split]
    ytrain = y[:split]
    xval = x[split:]
    yval = y[split:]

    saveData('train.dat', xtrain, ytrain, h)
    saveData('val.dat', xval, yval, h)
| 101 | + |
0 commit comments