/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * 	Redistributions of source code must retain the above copyright notice,
 * 	this list of conditions and the following disclaimer.
 *
 *   *	Redistributions in binary form must reproduce the above copyright notice,
 * 	this list of conditions and the following disclaimer in the documentation
 * 	and/or other materials provided with the distribution.
 *
 *   *	Neither the name of the University of Southampton nor the names of its
 * 	contributors may be used to endorse or promote products derived from this
 * 	software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.image.annotation.evaluation.datasets;

import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemException;
import org.apache.commons.vfs2.FileSystemManager;
import org.apache.commons.vfs2.VFS;
import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.data.DataUtils;
import org.openimaj.data.dataset.GroupedDataset;
import org.openimaj.data.dataset.ListBackedDataset;
import org.openimaj.data.dataset.ListDataset;
import org.openimaj.data.dataset.MapBackedDataset;
import org.openimaj.experiment.annotations.DatasetDescription;
import org.openimaj.image.MBFImage;
import org.openimaj.image.annotation.evaluation.datasets.cifar.BinaryReader;

/**
 * CIFAR-10 Dataset. Contains 60000 tiny images in 10 classes (6000 per class).
 * Each image is 32x32 pixels.
 * <p>
 * The binary archive is fetched on first use and kept at the location given
 * by {@link DataUtils#getDataLocation(String)}; later calls read the stored
 * copy.
 *
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 *
 */
@Reference(
		type = ReferenceType.Article,
		author = { "Krizhevsky, A.", "Hinton, G." },
		title = "Learning multiple layers of features from tiny images",
		year = "2009",
		journal = "Master's thesis, Department of Computer Science, University of Toronto",
		publisher = "Citeseer")
@DatasetDescription(
		name = "CIFAR-10",
		description = "The CIFAR-10 dataset consists of 60000 32x32 colour "
				+ "images in 10 classes, with 6000 images per class. There are "
				+ "50000 training images and 10000 test images. The dataset is "
				+ "divided into five training batches and one test batch, each "
				+ "with 10000 images. The test batch contains exactly 1000 "
				+ "randomly-selected images from each class. The training batches "
				+ "contain the remaining images in random order, but some training "
				+ "batches may contain more images from one class than another. "
				+ "Between them, the training batches contain exactly 5000 images "
				+ "from each class.",
		creator = "Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton",
		url = "http://www.cs.toronto.edu/~kriz/cifar.html",
		downloadUrls = {
				"http://datasets.openimaj.org/cifar/cifar-10-binary.tar.gz",
		})
public class CIFAR10Dataset extends CIFARDataset {
	private static final String DATA_TGZ = "cifar/cifar-10-binary.tar.gz";
	private static final String DOWNLOAD_URL = "http://datasets.openimaj.org/cifar/cifar-10-binary.tar.gz";

	private static final String[] TRAINING_FILES = {
			"data_batch_1.bin",
			"data_batch_2.bin",
			"data_batch_3.bin",
			"data_batch_4.bin",
			"data_batch_5.bin" };
	private static final String TEST_FILE = "test_batch.bin";
	private static final String CLASSES_FILE = "batches.meta.txt";

	/** Number of records read from each batch file. */
	private static final int RECORDS_PER_BATCH = 10000;

	/** Suffix of the temporary file used while the archive is downloading. */
	private static final String PARTIAL_SUFFIX = ".part";

	private CIFAR10Dataset() {
		// static access only
	}

	/**
	 * Ensure the archive is available locally, downloading it if necessary,
	 * and build the VFS location of the directory holding the batch files.
	 * <p>
	 * The download is written to a temporary sibling file and only renamed to
	 * its final name once complete, so an interrupted transfer can never be
	 * mistaken for a valid archive by a later call.
	 *
	 * @return the VFS path of the batch directory inside the archive
	 * @throws IOException
	 *             if the storage directory cannot be created, or the archive
	 *             cannot be downloaded or moved into place
	 */
	private static String downloadAndGetPath() throws IOException {
		final File dataset = DataUtils.getDataLocation(DATA_TGZ);

		if (!dataset.exists()) {
			final File parent = dataset.getParentFile();
			if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
				throw new IOException("Unable to create directory " + parent);
			}

			final File partial = new File(parent, dataset.getName() + PARTIAL_SUFFIX);
			try {
				FileUtils.copyURLToFile(new URL(DOWNLOAD_URL), partial);

				// A failed rename is only an error if the archive is still
				// missing; it may have been put in place concurrently.
				if (!partial.renameTo(dataset) && !dataset.exists()) {
					throw new IOException("Unable to move " + partial + " to " + dataset);
				}
			} finally {
				// No-op after a successful rename; removes the incomplete
				// download otherwise.
				FileUtils.deleteQuietly(partial);
			}
		}

		return "tgz:file:" + dataset.toString() + "!cifar-10-batches-bin/";
	}

	/**
	 * Load the training images using the given reader. To load the images as
	 * {@link MBFImage}s, you would do the following: <code>
	 * CIFAR10Dataset.getTrainingImages(CIFAR10Dataset.MBFIMAGE_READER);
	 * </code>
	 *
	 * @param reader
	 *            the reader
	 * @return the training image dataset
	 * @throws IOException
	 *             if the archive cannot be downloaded or read, or a batch file
	 *             is truncated or contains an unknown class label
	 */
	public static <IMAGE> GroupedDataset<String, ListDataset<IMAGE>, IMAGE> getTrainingImages(BinaryReader<IMAGE> reader)
			throws IOException
	{
		return loadBatches(reader, TRAINING_FILES);
	}

	/**
	 * Load the test images using the given reader. To load the images as
	 * {@link MBFImage}s, you would do the following: <code>
	 * CIFAR10Dataset.getTestImages(CIFAR10Dataset.MBFIMAGE_READER);
	 * </code>
	 *
	 * @param reader
	 *            the reader
	 * @return the test image dataset
	 * @throws IOException
	 *             if the archive cannot be downloaded or read, or the batch
	 *             file is truncated or contains an unknown class label
	 */
	public static <IMAGE> GroupedDataset<String, ListDataset<IMAGE>, IMAGE> getTestImages(BinaryReader<IMAGE> reader)
			throws IOException
	{
		return loadBatches(reader, TEST_FILE);
	}

	/**
	 * Build a dataset grouped by class name from the named batch files.
	 *
	 * @param reader
	 *            converts each raw record into an image
	 * @param batchFiles
	 *            names of the batch files, relative to the batch directory
	 * @return the populated dataset
	 * @throws IOException
	 *             if the archive or any of the batch files cannot be read
	 */
	private static <IMAGE> GroupedDataset<String, ListDataset<IMAGE>, IMAGE> loadBatches(BinaryReader<IMAGE> reader,
			String... batchFiles) throws IOException
	{
		final MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset = new MapBackedDataset<String, ListDataset<IMAGE>, IMAGE>();

		final FileSystemManager fsManager = VFS.getManager();
		final FileObject base = fsManager.resolveFile(downloadAndGetPath());

		final List<String> classList = loadClasses(dataset, base);

		for (final String batchFile : batchFiles) {
			DataInputStream is = null;
			try {
				is = new DataInputStream(base.resolveFile(batchFile).getContent().getInputStream());

				loadData(is, dataset, classList, reader);
			} finally {
				IOUtils.closeQuietly(is);
			}
		}

		return dataset;
	}

	/**
	 * Read the class names and register an empty group in the dataset for
	 * each of them.
	 *
	 * @param dataset
	 *            the dataset to add the groups to
	 * @param base
	 *            the directory containing the class-name file
	 * @return the lines of the class-name file, in file order, so that a
	 *         record's label can be used as an index into the list
	 * @throws FileSystemException
	 *             if the class-name file cannot be resolved or opened
	 * @throws IOException
	 *             if the class-name file cannot be read
	 */
	private static <IMAGE> List<String> loadClasses(final MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset,
			final FileObject base) throws FileSystemException, IOException
	{
		final List<String> classList;
		InputStream classStream = null;
		try {
			classStream = base.resolveFile(CLASSES_FILE).getContent().getInputStream();
			// Explicit charset so the result does not depend on the platform
			// default encoding.
			classList = IOUtils.readLines(classStream, "UTF-8");
		} finally {
			IOUtils.closeQuietly(classStream);
		}

		for (final String clz : classList) {
			// Blank lines (the listing may end with one - TODO confirm) are
			// not classes. They stay in the returned list so label indices
			// are unaffected, but get no group of their own.
			if (clz.trim().isEmpty()) {
				continue;
			}
			dataset.put(clz, new ListBackedDataset<IMAGE>());
		}
		return classList;
	}

	/**
	 * Read one batch from the stream into the dataset. Each record is a single
	 * label byte followed by <code>WIDTH * HEIGHT * 3</code> bytes of data
	 * that are handed to the reader.
	 *
	 * @param is
	 *            the stream positioned at the start of a batch
	 * @param dataset
	 *            the dataset to add the decoded images to
	 * @param classList
	 *            the class names, indexed by label
	 * @param reader
	 *            converts each raw record into an image
	 * @throws IOException
	 *             if the stream ends before all records have been read, or a
	 *             label does not correspond to a registered class
	 */
	private static <IMAGE> void loadData(DataInputStream is,
			MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset, List<String> classList,
			BinaryReader<IMAGE> reader) throws IOException
	{
		final int recordLength = WIDTH * HEIGHT * 3;

		for (int i = 0; i < RECORDS_PER_BATCH; i++) {
			// readUnsignedByte reports a truncated stream as an EOFException
			// instead of returning -1 like read() does.
			final int clz = is.readUnsignedByte();
			if (clz >= classList.size()) {
				throw new IOException("Invalid class label " + clz + " in record " + i + "; only "
						+ classList.size() + " class names are known");
			}

			final String clzStr = classList.get(clz);
			final ListDataset<IMAGE> group = dataset.get(clzStr);
			if (group == null) {
				throw new IOException("No class is registered for label " + clz + " in record " + i);
			}

			// A fresh buffer per record: the reader is free to keep a
			// reference to the array it is given.
			final byte[] record = new byte[recordLength];
			is.readFully(record);

			group.add(reader.read(record));
		}
	}
}