001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.experiment.dataset.sampling;
031
032import java.util.Map;
033
034import org.openimaj.data.RandomData;
035import org.openimaj.data.dataset.GroupedDataset;
036import org.openimaj.data.dataset.ListBackedDataset;
037import org.openimaj.data.dataset.ListDataset;
038import org.openimaj.data.dataset.MapBackedDataset;
039import org.openimaj.util.pair.IndependentPair;
040
041/**
042 * A uniformly random sampling scheme for grouped datasets. Both sampling with
043 * and without replacement are supported. The sampler returns a dataset that
044 * selects a predefined fraction of the input data. No attempt is made to ensure
045 * that the distribution across groups is maintained (see
046 * {@link StratifiedGroupedUniformRandomisedSampler} to achieve that).
047 * 
048 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
049 * 
050 * @param <KEY>
051 *            Type of groups
052 * @param <INSTANCE>
053 *            Type of instances
054 */
055public class GroupedUniformRandomisedSampler<KEY, INSTANCE>
056                implements
057                Sampler<GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE>>
058{
059        private boolean withReplacement = false;
060
061        // this is overloaded to hold either a percentage or number of instances.
062        // Percentages are stored in the range 0..1; numbers are stored as -number.
063        private double percentage;
064
065        /**
066         * Construct a {@link GroupedUniformRandomisedSampler} with the given
067         * percentage of instances to select. By default, the sampling is without
068         * replacement (i.e. an instance can only be selected once).
069         * 
070         * @param percentage
071         *            percentage of instances to select
072         */
073        public GroupedUniformRandomisedSampler(double percentage) {
074                if (percentage < 0 || percentage > 1)
075                        throw new IllegalArgumentException("percentage of sample instances must be between 0 and 1");
076
077                this.percentage = percentage;
078        }
079
080        /**
081         * Construct a {@link GroupedUniformRandomisedSampler} with the given
082         * percentage of instances to select, using with with-replacement or
083         * without-replacement sampling.
084         * 
085         * @param percentage
086         *            percentage of instances to select
087         * @param withReplacement
088         *            should the sampling be performed with replacement (true); or
089         *            without replacement (false).
090         */
091        public GroupedUniformRandomisedSampler(double percentage,
092                        boolean withReplacement)
093        {
094                this(percentage);
095                this.withReplacement = withReplacement;
096        }
097
098        /**
099         * Construct a {@link GroupedUniformRandomisedSampler} with the given number
100         * of instances to select. By default, the sampling is without replacement
101         * (i.e. an instance can only be selected once).
102         * 
103         * @param number
104         *            number of instances to select
105         */
106        public GroupedUniformRandomisedSampler(int number) {
107                if (number < 1)
108                        throw new IllegalArgumentException("number of sample instances must be greater than 0");
109
110                this.percentage = -number;
111        }
112
113        /**
114         * Construct a {@link GroupedUniformRandomisedSampler} with the given number
115         * of instances to select, using with with-replacement or
116         * without-replacement sampling.
117         * 
118         * @param number
119         *            number of instances to select
120         * @param withReplacement
121         *            should the sampling be performed with replacement (true); or
122         *            without replacement (false).
123         */
124        public GroupedUniformRandomisedSampler(int number,
125                        boolean withReplacement)
126        {
127                this(number);
128                this.withReplacement = withReplacement;
129        }
130
131        @Override
132        public GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
133                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset)
134        {
135                final int N;
136                if (percentage >= 0) {
137                        N = (int) Math.round(dataset.numInstances() * percentage);
138                } else {
139                        N = (int) -percentage;
140                }
141
142                int[] selectedIds;
143                if (withReplacement) {
144                        selectedIds = RandomData.getRandomIntArray(N, 0, dataset.numInstances());
145                } else {
146                        selectedIds = RandomData.getUniqueRandomInts(N, 0, dataset.numInstances());
147                }
148
149                final MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample = new MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE>();
150                final Map<KEY, ListDataset<INSTANCE>> map = sample.getMap();
151
152                for (int i = 0; i < N; i++) {
153                        final IndependentPair<KEY, INSTANCE> p = select(selectedIds[i], dataset);
154
155                        ListBackedDataset<INSTANCE> lbd = (ListBackedDataset<INSTANCE>) map
156                                        .get(p.firstObject());
157                        if (lbd == null)
158                                map.put(p.firstObject(),
159                                                lbd = new ListBackedDataset<INSTANCE>());
160                        lbd.add(p.getSecondObject());
161                }
162
163                return sample;
164        }
165
166        private IndependentPair<KEY, INSTANCE> select(int idx,
167                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset)
168        {
169                for (final KEY k : dataset.getGroups()) {
170                        final ListDataset<INSTANCE> instances = dataset.getInstances(k);
171                        final int sz = instances.size();
172
173                        if (idx < sz) {
174                                return new IndependentPair<KEY, INSTANCE>(k,
175                                                instances.getInstance(idx));
176                        }
177                        idx -= sz;
178                }
179
180                return null;
181        }
182
183        /**
184         * Sample a dataset with the given percentage of instances to select. By
185         * default, the sampling is without replacement (i.e. an instance can only
186         * be selected once).
187         * 
188         * @param dataset
189         *            the dataset to sample
190         * @param percentage
191         *            percentage of instances to select
192         * @return the sampled dataset
193         */
194        public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
195                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, double percentage)
196        {
197                return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(percentage).sample(dataset);
198        }
199
200        /**
201         * Sample a dataset with the given percentage of instances to select, using
202         * with with-replacement or without-replacement sampling.
203         * 
204         * @param dataset
205         *            the dataset to sample
206         * @param percentage
207         *            percentage of instances to select
208         * @param withReplacement
209         *            should the sampling be performed with replacement (true); or
210         *            without replacement (false).
211         * @return the sampled dataset
212         */
213        public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
214                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, double percentage,
215                        boolean withReplacement)
216        {
217                return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(percentage, withReplacement).sample(dataset);
218        }
219
220        /**
221         * Sample a dataset with the given number of instances to select. By
222         * default, the sampling is without replacement (i.e. an instance can only
223         * be selected once).
224         * 
225         * @param dataset
226         *            the dataset to sample
227         * @param number
228         *            number of instances to select
229         * @return the sampled dataset
230         */
231        public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
232                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, int number)
233        {
234                return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(number).sample(dataset);
235        }
236
237        /**
238         * Sample a dataset with the given number of instances to select, using with
239         * with-replacement or without-replacement sampling.
240         * 
241         * @param dataset
242         *            the dataset to sample
243         * @param number
244         *            number of instances to select
245         * @param withReplacement
246         *            should the sampling be performed with replacement (true); or
247         *            without replacement (false).
248         * @return the sampled dataset
249         */
250        public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
251                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, int number,
252                        boolean withReplacement)
253        {
254                return new GroupedUniformRandomisedSampler<KEY, INSTANCE>(number, withReplacement).sample(dataset);
255        }
256}