001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.ml.dataset; 031 032import gov.sandia.cognition.math.matrix.Vector; 033import gov.sandia.cognition.math.matrix.VectorFactory; 034 035import java.io.BufferedReader; 036import java.io.InputStreamReader; 037import java.util.Arrays; 038import java.util.HashSet; 039import java.util.Set; 040 041import org.apache.logging.log4j.Logger; 042import org.apache.logging.log4j.LogManager; 043 044import org.openimaj.data.dataset.Dataset; 045import org.openimaj.data.dataset.ListBackedDataset; 046import org.openimaj.data.dataset.ListDataset; 047import org.openimaj.data.dataset.MapBackedDataset; 048import org.openimaj.experiment.annotations.DatasetDescription; 049 050/** 051 * A {@link Dataset} instance of the standard wine clustering experiment found 052 * here: 053 * 054 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 055 * 056 */ 057@DatasetDescription( 058 name = "Wine Data Set", 059 description = "" + 060 "These data are the results of a chemical analysis of wines grown in the same region in " + 061 "Italy but derived from three different cultivars. The analysis determined the quantities " + 062 "of 13 constituents found in each of the three types of wines." + 063 "" + 064 "I think that the initial data set had around 30 variables, but for some reason I only have " + 065 "the 13 dimensional version. I had a list of what the 30 or so variables were, but a.) I lost" + 066 " it, and b.), I would not know which 13 variables are included in the set." + 067 "" + 068 "The attributes are (dontated by Riccardo Leardi, riclea '@' anchem.unige.it )" + 069 "1) Alcohol" + 070 "2) Malic acid" + 071 "3) Ash" + 072 "4) Alcalinity of ash" + 073 "5) Magnesium" + 074 "6) Total phenols" + 075 "7) Flavanoids" + 076 "8) Nonflavanoid phenols" + 077 "9) Proanthocyanins" + 078 "10)Color intensity" + 079 "11)Hue" + 080 "12)OD280/OD315 of diluted wines" + 081 "13)Proline" + 082 "" + 083 "In a classification context, this is a well posed problem with \"well behaved\" class structures." + 084 " A good data set for first testing of a new classifier, but not very challenging. ", 085 creator = "Forina, M. et al, PARVUS - ", 086 url = "http://archive.ics.uci.edu/ml/datasets/Wine", 087 downloadUrls = { 088 "http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 089 }) 090public class WineDataset extends MapBackedDataset<Integer, ListDataset<double[]>, double[]> { 091 final static Logger logger = LogManager.getLogger(WineDataset.class); 092 093 /** 094 * Loads the wine dataset, mean centres the dataset 095 * 096 * @param clusters 097 * valid clusters, if empty all clusters are chosen 098 */ 099 public WineDataset(Integer... clusters) { 100 this(true, clusters); 101 } 102 103 /** 104 * Loads the wine dataset from wine.data 105 * 106 * @param normalise 107 * whether to mean center the dataset 108 * @param clusters 109 * valid clusters, if empty all clusters are chosen 110 */ 111 public WineDataset(boolean normalise, Integer... clusters) { 112 final BufferedReader br = new BufferedReader( 113 new InputStreamReader(WineDataset.class.getResourceAsStream("wine.data"))); 114 String line = null; 115 Vector mean = null; 116 Set<Integer> clusterSet = null; 117 if (clusters.length != 0) { 118 clusterSet = new HashSet<Integer>(); 119 clusterSet.addAll(Arrays.asList(clusters)); 120 } 121 122 try { 123 while ((line = br.readLine()) != null) { 124 final String[] parts = line.split(","); 125 final int cluster = Integer.parseInt(parts[0].trim()); 126 if (clusterSet != null && !clusterSet.contains(cluster)) 127 continue; 128 final double[] data = new double[parts.length - 1]; 129 for (int i = 0; i < data.length; i++) { 130 data[i] = Double.parseDouble(parts[i + 1]); 131 } 132 133 ListDataset<double[]> ds = this.get(cluster); 134 if (ds == null) 135 this.put(cluster, ds = new ListBackedDataset<double[]>()); 136 ds.add(data); 137 final Vector copyArray = VectorFactory.getDefault().copyArray(data); 138 if (mean == null) { 139 mean = copyArray.clone(); 140 } 141 else { 142 mean.plusEquals(copyArray); 143 } 144 } 145 mean.scaleEquals(1. / this.numInstances()); 146 if (normalise) { 147 normalise(mean); 148 } 149 } catch (final Exception e) { 150 logger.error("Wine dataset failed to load", e); 151 } 152 } 153 154 private void normalise(Vector mean) { 155 for (final double[] data : this) { 156 for (int i = 0; i < data.length; i++) { 157 data[i] -= mean.getElement(i); 158 } 159 } 160 } 161}