001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.image.annotation.evaluation.datasets;
031
032import java.io.DataInputStream;
033import java.io.File;
034import java.io.IOException;
035import java.io.InputStream;
036import java.net.URL;
037import java.util.List;
038
039import org.apache.commons.io.FileUtils;
040import org.apache.commons.io.IOUtils;
041import org.apache.commons.vfs2.FileObject;
042import org.apache.commons.vfs2.FileSystemException;
043import org.apache.commons.vfs2.FileSystemManager;
044import org.apache.commons.vfs2.VFS;
045import org.openimaj.citation.annotation.Reference;
046import org.openimaj.citation.annotation.ReferenceType;
047import org.openimaj.data.DataUtils;
048import org.openimaj.data.dataset.GroupedDataset;
049import org.openimaj.data.dataset.ListBackedDataset;
050import org.openimaj.data.dataset.ListDataset;
051import org.openimaj.data.dataset.MapBackedDataset;
052import org.openimaj.experiment.annotations.DatasetDescription;
053import org.openimaj.image.MBFImage;
054import org.openimaj.image.annotation.evaluation.datasets.cifar.BinaryReader;
055
056/**
057 * CIFAR-10 Dataset. Contains 60000 tiny images in 10 classes (6000 per class).
058 * Each image is 32x32 pixels.
059 *
060 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
061 *
062 */
063@Reference(
064                type = ReferenceType.Article,
065                author = { "Krizhevsky, A.", "Hinton, G." },
066                title = "Learning multiple layers of features from tiny images",
067                year = "2009",
068                journal = "Master's thesis, Department of Computer Science, University of Toronto",
069                publisher = "Citeseer")
070@DatasetDescription(
071                name = "CIFAR-10",
072                description = "The CIFAR-10 dataset consists of 60000 32x32 colour "
073                                + "images in 10 classes, with 6000 images per class. There are "
074                                + "50000 training images and 10000 test images. The dataset is "
075                                + "divided into five training batches and one test batch, each "
076                                + "with 10000 images. The test batch contains exactly 1000 "
077                                + "randomly-selected images from each class. The training batches "
078                                + "contain the remaining images in random order, but some training "
079                                + "batches may contain more images from one class than another. "
080                                + "Between them, the training batches contain exactly 5000 images "
081                                + "from each class.",
082                creator = "Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton",
083                url = "http://www.cs.toronto.edu/~kriz/cifar.html",
084                downloadUrls = {
085                                "http://datasets.openimaj.org/cifar/cifar-10-binary.tar.gz",
086                })
087public class CIFAR10Dataset extends CIFARDataset {
088        private static final String DATA_TGZ = "cifar/cifar-10-binary.tar.gz";
089        private static final String DOWNLOAD_URL = "http://datasets.openimaj.org/cifar/cifar-10-binary.tar.gz";
090
091        private static final String[] TRAINING_FILES = {
092                        "data_batch_1.bin",
093                        "data_batch_2.bin",
094                        "data_batch_3.bin",
095                        "data_batch_4.bin",
096                        "data_batch_5.bin" };
097        private static final String TEST_FILE = "test_batch.bin";
098        private static final String CLASSES_FILE = "batches.meta.txt";
099
100        private CIFAR10Dataset() {
101        }
102
103        private static String downloadAndGetPath() throws IOException {
104                final File dataset = DataUtils.getDataLocation(DATA_TGZ);
105
106                if (!(dataset.exists())) {
107                        dataset.getParentFile().mkdirs();
108                        FileUtils.copyURLToFile(new URL(DOWNLOAD_URL), dataset);
109                }
110
111                return "tgz:file:" + dataset.toString() + "!cifar-10-batches-bin/";
112        }
113
114        /**
115         * Load the training images using the given reader. To load the images as
116         * {@link MBFImage}s, you would do the following: <code>
117         * CIFAR10Dataset.getTrainingImages(CIFAR10Dataset.MBFIMAGE_READER);
118         * </code>
119         *
120         * @param reader
121         *            the reader
122         * @return the training image dataset
123         * @throws IOException
124         */
125        public static <IMAGE> GroupedDataset<String, ListDataset<IMAGE>, IMAGE> getTrainingImages(BinaryReader<IMAGE> reader)
126                        throws IOException
127        {
128                final MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset = new MapBackedDataset<String, ListDataset<IMAGE>, IMAGE>();
129
130                final FileSystemManager fsManager = VFS.getManager();
131                final FileObject base = fsManager.resolveFile(downloadAndGetPath());
132
133                final List<String> classList = loadClasses(dataset, base);
134
135                for (final String t : TRAINING_FILES) {
136                        DataInputStream is = null;
137                        try {
138                                is = new DataInputStream(base.resolveFile(t).getContent().getInputStream());
139
140                                loadData(is, dataset, classList, reader);
141                        } finally {
142                                IOUtils.closeQuietly(is);
143                        }
144                }
145
146                return dataset;
147        }
148
149        private static <IMAGE> List<String> loadClasses(final MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset,
150                        final FileObject base) throws FileSystemException, IOException
151                        {
152                InputStream classStream = null;
153                List<String> classList = null;
154                try {
155                        classStream = base.resolveFile(CLASSES_FILE).getContent().getInputStream();
156                        classList = IOUtils.readLines(classStream);
157                } finally {
158                        IOUtils.closeQuietly(classStream);
159                }
160
161                for (final String clz : classList)
162                        dataset.put(clz, new ListBackedDataset<IMAGE>());
163                return classList;
164                        }
165
166        private static <IMAGE> void loadData(DataInputStream is,
167                        MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset, List<String> classList,
168                        BinaryReader<IMAGE> reader) throws IOException
169        {
170
171                for (int i = 0; i < 10000; i++) {
172                        final int clz = is.read();
173                        final String clzStr = classList.get(clz);
174                        final byte[] record = new byte[WIDTH * HEIGHT * 3];
175                        is.readFully(record);
176
177                        dataset.get(clzStr).add(reader.read(record));
178                }
179        }
180
181        /**
182         * Load the test images using the given reader. To load the images as
183         * {@link MBFImage}s, you would do the following: <code>
184         * CIFAR10Dataset.getTestImages(CIFAR10Dataset.MBFIMAGE_READER);
185         * </code>
186         *
187         * @param reader
188         *            the reader
189         * @return the test image dataset
190         * @throws IOException
191         */
192        public static <IMAGE> GroupedDataset<String, ListDataset<IMAGE>, IMAGE> getTestImages(BinaryReader<IMAGE> reader)
193                        throws IOException
194        {
195                final MapBackedDataset<String, ListDataset<IMAGE>, IMAGE> dataset = new MapBackedDataset<String, ListDataset<IMAGE>, IMAGE>();
196
197                final FileSystemManager fsManager = VFS.getManager();
198                final FileObject base = fsManager.resolveFile(downloadAndGetPath());
199
200                final List<String> classList = loadClasses(dataset, base);
201
202                DataInputStream is = null;
203                try {
204                        is = new DataInputStream(base.resolveFile(TEST_FILE).getContent().getInputStream());
205                        loadData(is, dataset, classList, reader);
206                } finally {
207                        IOUtils.closeQuietly(is);
208                }
209
210                return dataset;
211        }
212}