001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.experiment.dataset.sampling; 031 032import java.util.Map; 033 034import org.openimaj.data.RandomData; 035import org.openimaj.data.dataset.GroupedDataset; 036import org.openimaj.data.dataset.ListBackedDataset; 037import org.openimaj.data.dataset.ListDataset; 038import org.openimaj.data.dataset.MapBackedDataset; 039import org.openimaj.util.pair.IndependentPair; 040 041/** 042 * A uniformly random sampling scheme for grouped datasets. Both sampling with 043 * and without replacement are supported. The sampler returns a dataset that 044 * selects a predefined fraction of the input data. No attempt is made to ensure 045 * that the distribution across groups is maintained (see 046 * {@link StratifiedGroupedUniformRandomisedSampler} to achieve that). 047 * 048 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 049 * 050 * @param <KEY> 051 * Type of groups 052 * @param <INSTANCE> 053 * Type of instances 054 */ 055public class GroupedUniformRandomisedSampler<KEY, INSTANCE> 056 implements 057 Sampler<GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE>> 058{ 059 private boolean withReplacement = false; 060 061 // this is overloaded to hold either a percentage or number of instances. 062 // Percentages are stored in the range 0..1; numbers are stored as -number. 063 private double percentage; 064 065 /** 066 * Construct a {@link GroupedUniformRandomisedSampler} with the given 067 * percentage of instances to select. By default, the sampling is without 068 * replacement (i.e. an instance can only be selected once). 069 * 070 * @param percentage 071 * percentage of instances to select 072 */ 073 public GroupedUniformRandomisedSampler(double percentage) { 074 if (percentage < 0 || percentage > 1) 075 throw new IllegalArgumentException("percentage of sample instances must be between 0 and 1"); 076 077 this.percentage = percentage; 078 } 079 080 /** 081 * Construct a {@link GroupedUniformRandomisedSampler} with the given 082 * percentage of instances to select, using with with-replacement or 083 * without-replacement sampling. 084 * 085 * @param percentage 086 * percentage of instances to select 087 * @param withReplacement 088 * should the sampling be performed with replacement (true); or 089 * without replacement (false). 090 */ 091 public GroupedUniformRandomisedSampler(double percentage, 092 boolean withReplacement) 093 { 094 this(percentage); 095 this.withReplacement = withReplacement; 096 } 097 098 /** 099 * Construct a {@link GroupedUniformRandomisedSampler} with the given number 100 * of instances to select. By default, the sampling is without replacement 101 * (i.e. an instance can only be selected once). 102 * 103 * @param number 104 * number of instances to select 105 */ 106 public GroupedUniformRandomisedSampler(int number) { 107 if (number < 1) 108 throw new IllegalArgumentException("number of sample instances must be greater than 0"); 109 110 this.percentage = -number; 111 } 112 113 /** 114 * Construct a {@link GroupedUniformRandomisedSampler} with the given number 115 * of instances to select, using with with-replacement or 116 * without-replacement sampling. 117 * 118 * @param number 119 * number of instances to select 120 * @param withReplacement 121 * should the sampling be performed with replacement (true); or 122 * without replacement (false). 123 */ 124 public GroupedUniformRandomisedSampler(int number, 125 boolean withReplacement) 126 { 127 this(number); 128 this.withReplacement = withReplacement; 129 } 130 131 @Override 132 public GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample( 133 GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset) 134 { 135 final int N; 136 if (percentage >= 0) { 137 N = (int) Math.round(dataset.numInstances() * percentage); 138 } else { 139 N = (int) -percentage; 140 } 141 142 int[] selectedIds; 143 if (withReplacement) { 144 selectedIds = RandomData.getRandomIntArray(N, 0, dataset.numInstances()); 145 } else { 146 selectedIds = RandomData.getUniqueRandomInts(N, 0, dataset.numInstances()); 147 } 148 149 final MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample = new MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE>(); 150 final Map<KEY, ListDataset<INSTANCE>> map = sample.getMap(); 151 152 for (int i = 0; i < N; i++) { 153 final IndependentPair<KEY, INSTANCE> p = select(selectedIds[i], dataset); 154 155 ListBackedDataset<INSTANCE> lbd = (ListBackedDataset<INSTANCE>) map 156 .get(p.firstObject()); 157 if (lbd == null) 158 map.put(p.firstObject(), 159 lbd = new ListBackedDataset<INSTANCE>()); 160 lbd.add(p.getSecondObject()); 161 } 162 163 return sample; 164 } 165 166 private IndependentPair<KEY, INSTANCE> select(int idx, 167 GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset) 168 { 169 for (final KEY k : dataset.getGroups()) { 170 final ListDataset<INSTANCE> instances = dataset.getInstances(k); 171 final int sz = instances.size(); 172 173 if (idx < sz) { 174 return new IndependentPair<KEY, INSTANCE>(k, 175 instances.getInstance(idx)); 176 } 177 idx -= sz; 178 } 179 180 return null; 181 } 182 183 /** 184 * Sample a dataset with the given percentage of instances to select. By 185 * default, the sampling is without replacement (i.e. an instance can only 186 * be selected once). 187 * 188 * @param dataset 189 * the dataset to sample 190 * @param percentage 191 * percentage of instances to select 192 * @return the sampled dataset 193 */ 194 public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample( 195 GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, double percentage) 196 { 197 return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(percentage).sample(dataset); 198 } 199 200 /** 201 * Sample a dataset with the given percentage of instances to select, using 202 * with with-replacement or without-replacement sampling. 203 * 204 * @param dataset 205 * the dataset to sample 206 * @param percentage 207 * percentage of instances to select 208 * @param withReplacement 209 * should the sampling be performed with replacement (true); or 210 * without replacement (false). 211 * @return the sampled dataset 212 */ 213 public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample( 214 GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, double percentage, 215 boolean withReplacement) 216 { 217 return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(percentage, withReplacement).sample(dataset); 218 } 219 220 /** 221 * Sample a dataset with the given number of instances to select. By 222 * default, the sampling is without replacement (i.e. an instance can only 223 * be selected once). 224 * 225 * @param dataset 226 * the dataset to sample 227 * @param number 228 * number of instances to select 229 * @return the sampled dataset 230 */ 231 public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample( 232 GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, int number) 233 { 234 return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(number).sample(dataset); 235 } 236 237 /** 238 * Sample a dataset with the given number of instances to select, using with 239 * with-replacement or without-replacement sampling. 240 * 241 * @param dataset 242 * the dataset to sample 243 * @param number 244 * number of instances to select 245 * @param withReplacement 246 * should the sampling be performed with replacement (true); or 247 * without replacement (false). 248 * @return the sampled dataset 249 */ 250 public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample( 251 GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, int number, 252 boolean withReplacement) 253 { 254 return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(number, withReplacement).sample(dataset); 255 } 256}