001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.ml.dataset;
031
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import org.openimaj.data.dataset.Dataset;
import org.openimaj.data.dataset.ListBackedDataset;
import org.openimaj.data.dataset.ListDataset;
import org.openimaj.data.dataset.MapBackedDataset;
import org.openimaj.experiment.annotations.DatasetDescription;
049
050/**
051 * A {@link Dataset} instance of the standard wine clustering experiment found
052 * here:
053 *
054 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
055 *
056 */
057@DatasetDescription(
058                name = "Wine Data Set",
059                description = "" +
060                                "These data are the results of a chemical analysis of wines grown in the same region in " +
061                                "Italy but derived from three different cultivars. The analysis determined the quantities " +
062                                "of 13 constituents found in each of the three types of wines." +
063                                "" +
064                                "I think that the initial data set had around 30 variables, but for some reason I only have " +
065                                "the 13 dimensional version. I had a list of what the 30 or so variables were, but a.) I lost" +
066                                " it, and b.), I would not know which 13 variables are included in the set." +
067                                "" +
068                                "The attributes are (dontated by Riccardo Leardi, riclea '@' anchem.unige.it )" +
069                                "1) Alcohol" +
070                                "2) Malic acid" +
071                                "3) Ash" +
072                                "4) Alcalinity of ash" +
073                                "5) Magnesium" +
074                                "6) Total phenols" +
075                                "7) Flavanoids" +
076                                "8) Nonflavanoid phenols" +
077                                "9) Proanthocyanins" +
078                                "10)Color intensity" +
079                                "11)Hue" +
080                                "12)OD280/OD315 of diluted wines" +
081                                "13)Proline" +
082                                "" +
083                                "In a classification context, this is a well posed problem with \"well behaved\" class structures." +
084                                " A good data set for first testing of a new classifier, but not very challenging. ",
085                                creator = "Forina, M. et al, PARVUS - ",
086                                url = "http://archive.ics.uci.edu/ml/datasets/Wine",
087                                downloadUrls = {
088                                "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
089                })
090public class WineDataset extends MapBackedDataset<Integer, ListDataset<double[]>, double[]> {
091        final static Logger logger = LogManager.getLogger(WineDataset.class);
092
093        /**
094         * Loads the wine dataset, mean centres the dataset
095         * 
096         * @param clusters
097         *            valid clusters, if empty all clusters are chosen
098         */
099        public WineDataset(Integer... clusters) {
100                this(true, clusters);
101        }
102
103        /**
104         * Loads the wine dataset from wine.data
105         * 
106         * @param normalise
107         *            whether to mean center the dataset
108         * @param clusters
109         *            valid clusters, if empty all clusters are chosen
110         */
111        public WineDataset(boolean normalise, Integer... clusters) {
112                final BufferedReader br = new BufferedReader(
113                                new InputStreamReader(WineDataset.class.getResourceAsStream("wine.data")));
114                String line = null;
115                Vector mean = null;
116                Set<Integer> clusterSet = null;
117                if (clusters.length != 0) {
118                        clusterSet = new HashSet<Integer>();
119                        clusterSet.addAll(Arrays.asList(clusters));
120                }
121
122                try {
123                        while ((line = br.readLine()) != null) {
124                                final String[] parts = line.split(",");
125                                final int cluster = Integer.parseInt(parts[0].trim());
126                                if (clusterSet != null && !clusterSet.contains(cluster))
127                                        continue;
128                                final double[] data = new double[parts.length - 1];
129                                for (int i = 0; i < data.length; i++) {
130                                        data[i] = Double.parseDouble(parts[i + 1]);
131                                }
132
133                                ListDataset<double[]> ds = this.get(cluster);
134                                if (ds == null)
135                                        this.put(cluster, ds = new ListBackedDataset<double[]>());
136                                ds.add(data);
137                                final Vector copyArray = VectorFactory.getDefault().copyArray(data);
138                                if (mean == null) {
139                                        mean = copyArray.clone();
140                                }
141                                else {
142                                        mean.plusEquals(copyArray);
143                                }
144                        }
145                        mean.scaleEquals(1. / this.numInstances());
146                        if (normalise) {
147                                normalise(mean);
148                        }
149                } catch (final Exception e) {
150                        logger.error("Wine dataset failed to load", e);
151                }
152        }
153
154        private void normalise(Vector mean) {
155                for (final double[] data : this) {
156                        for (int i = 0; i < data.length; i++) {
157                                data[i] -= mean.getElement(i);
158                        }
159                }
160        }
161}