Added:MNIST script

abilng · abilng · commit 257d86fd2ca6 · 2014-09-03T00:22:20.000+05:30
diff --git a/sample_config/MNIST/DBN/rbm_spec.json b/sample_config/MNIST/DBN/rbm_spec.json
@@ -1,18 +1,9 @@
 
 {
 
-  "comment" : "layers :: RBM layer configuration (No: of Nodes)",
   "hidden_layers": [1000, 1000, 1000],
-
-  "comment" : "activation :: sigmoid or tanh",
   "activation" : "sigmoid",
-
-  "comment" : "pretrained_layers:number of layers to be pre-trained",
   "pretrained_layers" : 3,
-
-  "comment" : "first_layer_type::type for the first layer; either 'bb' (Bernoulli-Bernoulli) or 'gb' (Gaussian-Bernoulli)",
   "first_layer_type" : "gb",
-
-  "comment" : "random_seed::",
   "random_seed" : 89677
 }
diff --git a/sample_config/MNIST/README.md b/sample_config/MNIST/README.md
@@ -0,0 +1,6 @@
+The MNIST database of handwritten digits, available from [this page](http://yann.lecun.com/exdb/mnist/), has a training set of 60,000 examples, and a test set of 10,000 examples.
+
+* Download [MNIST dataset](http://yann.lecun.com/exdb/mnist/).
+* Convert it to the NP format (a JSON header line followed by binary NumPy records) using [mnist.py](mnist.py).
+* Try it with any of the given recipes (or with your own or a modified one).
+
diff --git a/sample_config/MNIST/SDA/sda_spec.json b/sample_config/MNIST/SDA/sda_spec.json
@@ -1,14 +1,7 @@
 {
 
-    "comment" : "hidden_layers :: RBM layer configuration (No: of Nodes)",
     "hidden_layers": [1000, 1000, 1000],
-
-    "comment":"",
     "corruption_levels":  [0.1, 0.2, 0.3],
-
-    "comment" : "activation :: sigmoid or tanh",
     "activation" : "tanh",
-
-    "comment" : "random_seed::",
     "random_seed" : 89677
 }
diff --git a/sample_config/MNIST/mnist.py b/sample_config/MNIST/mnist.py
@@ -0,0 +1,101 @@
+from struct import unpack
+import gzip,os,json,numpy
+from numpy import zeros, uint8
+from pylab import imshow, show, cm
+import cPickle as pickle
+
+def get_labeled_data(imagefile, labelfile):
+    """
+    Read input-vector (image) and target class (label, 0-9) and return
+    it as list of tuples.
+    """
+
+    # Open the images with gzip in read binary mode
+    images = gzip.open(imagefile, 'rb')
+    labels = gzip.open(labelfile, 'rb')
+
+    # Read the binary data
+
+    # We have to get big endian unsigned int. So we need '>I'
+
+    # Get metadata for images
+    images.read(4)  # skip the magic_number
+    number_of_images = images.read(4)
+    number_of_images = unpack('>I', number_of_images)[0]
+    rows = images.read(4)
+    rows = unpack('>I', rows)[0]
+    cols = images.read(4)
+    cols = unpack('>I', cols)[0]
+
+    # Get metadata for labels
+    labels.read(4)  # skip the magic_number
+    N = labels.read(4)
+    N = unpack('>I', N)[0]
+
+    if number_of_images != N:
+        raise Exception('The number of labels did not match '
+                            'the number of images.')
+
+    # Get the data
+    x = zeros((N, rows*cols), dtype=float)  # Initialize numpy array 
+    y = zeros((N, 1), dtype=uint8)  # Initialize numpy array
+    
+    tempx=zeros((rows,cols), dtype=float)
+    for i in range(N):
+        print 'Extracting ... {0}%\r'.format((i*100/N)),
+        for row in range(rows):
+            for col in range(cols):
+                tmp_pixel = images.read(1)  # Just a single byte
+                tmp_pixel = unpack('>B', tmp_pixel)[0]
+                tempx[row][col] = (float(tmp_pixel) / 255)
+        x[i] = tempx.flatten();
+        tmp_label = labels.read(1)
+        y[i] = unpack('>B', tmp_label)[0]
+
+    header = {}
+    header['featdim'] = rows*cols;
+    header['input_shape'] = [rows,cols,1]
+
+    return x,y,header
+
+ 
+def saveData(name,x,y,header):
+    
+    filehandle = open(name,'ab');
+    filehandle.write(json.dumps(header)+'\n')
+    dt={'names': ['d','l'],'formats': [('>f2',header['featdim']),'>i2']}
+    data = numpy.zeros(1,dtype= numpy.dtype(dt))
+    
+    for vector,label in zip(x,y):
+        data['d']=vector; data['l']=label;
+        data.tofile(filehandle); 
+    
+    filehandle.flush();
+    filehandle.close();
+
+if __name__ == '__main__':
+    print("Get testset")
+    (x,y,h)=get_labeled_data('t10k-images-idx3-ubyte.gz',
+                               't10k-labels-idx1-ubyte.gz')
+    print("Got %i testing datasets." % len(x))
+    saveData('test.dat',x,y,h);
+
+    print("Get trainingset")
+    (x,y,h)=get_labeled_data('train-images-idx3-ubyte.gz',
+                                'train-labels-idx1-ubyte.gz')
+    print("Got %i training datasets." % len(x))
+    seed=9090;
+    numpy.random.seed(seed)
+    numpy.random.shuffle(x) 
+    numpy.random.seed(seed)
+    numpy.random.shuffle(y)
+
+    N = len(x)
+    xtrain = x[:int(N*0.75)]
+    ytrain = y[:int(N*0.75)]
+    xval = x[int(N*0.75)+1:]
+    yval = y[int(N*0.75)+1:]
+
+    saveData('train.dat',xtrain,ytrain,h);
+    saveData('val.dat',xval,yval,h);
+

Original file line number	Diff line number	Diff line change
`@@ -1,14 +1,7 @@`
`1`	`1`	`{`
`2`	`2`
`3`		`- "comment" : "hidden_layers :: RBM layer configuration (No: of Nodes)",`
`4`	`3`	`"hidden_layers": [1000, 1000, 1000],`
`5`		`-`
`6`		`- "comment":"",`
`7`	`4`	`"corruption_levels": [0.1, 0.2, 0.3],`
`8`		`-`
`9`		`- "comment" : "activation :: sigmoid or tanh",`
`10`	`5`	`"activation" : "tanh",`
`11`		`-`
`12`		`- "comment" : "random_seed::",`
`13`	`6`	`"random_seed" : 89677`
`14`	`7`	`}`