001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.experiment.dataset.sampling;
031
032import java.util.List;
033import java.util.Map;
034
035import org.openimaj.data.RandomData;
036import org.openimaj.data.dataset.GroupedDataset;
037import org.openimaj.data.dataset.ListBackedDataset;
038import org.openimaj.data.dataset.ListDataset;
039import org.openimaj.data.dataset.MapBackedDataset;
040import org.openimaj.experiment.dataset.util.DatasetAdaptors;
041import org.openimaj.util.list.AcceptingListView;
042import org.openimaj.util.list.SkippingListView;
043
044/**
045 * A stratified uniformly random sampling scheme for grouped datasets. Both
046 * sampling with and without replacement are supported. The sampler returns a
047 * dataset that selects a predefined fraction of the input data. Specifically,
048 * the given percentage of data is selected from each group independently, thus
049 * ensuring that the distribution of relative group sizes before and after
050 * sampling remains (approximately) constant.
051 * 
052 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
053 * 
054 * @param <KEY>
055 *            Type of groups
056 * @param <INSTANCE>
057 *            Type of instances
058 */
059public class StratifiedGroupedUniformRandomisedSampler<KEY, INSTANCE>
060                implements
061                Sampler<GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE>>
062{
063        private boolean withReplacement = false;
064
065        // this is overloaded to hold either a percentage or number of instances.
066        // Percentages are stored in the range 0..1; numbers are stored as -number.
067        private double percentage;
068
069        /**
070         * Construct a {@link StratifiedGroupedUniformRandomisedSampler}
071         * with the given percentage of instances to select. By default, the
072         * sampling is without replacement (i.e. an instance can only be selected
073         * once).
074         * 
075         * @param percentage
076         *            percentage of instances to select
077         */
078        public StratifiedGroupedUniformRandomisedSampler(double percentage) {
079                if (percentage < 0 || percentage > 1)
080                        throw new IllegalArgumentException("percentage of sample instances must be between 0 and 1");
081
082                this.percentage = percentage;
083        }
084
085        /**
086         * Construct a {@link StratifiedGroupedUniformRandomisedSampler}
087         * with the given percentage of instances to select, using with
088         * with-replacement or without-replacement sampling.
089         * 
090         * @param percentage
091         *            percentage of instances to select
092         * @param withReplacement
093         *            should the sampling be performed with replacement (true); or
094         *            without replacement (false).
095         */
096        public StratifiedGroupedUniformRandomisedSampler(
097                        double percentage, boolean withReplacement)
098        {
099                this(percentage);
100                this.withReplacement = withReplacement;
101        }
102
103        /**
104         * Construct a {@link StratifiedGroupedUniformRandomisedSampler}
105         * with the given number of instances to select. By default, the sampling is
106         * without replacement (i.e. an instance can only be selected once).
107         * 
108         * @param number
109         *            number of instances to select
110         */
111        public StratifiedGroupedUniformRandomisedSampler(int number) {
112                if (number < 1)
113                        throw new IllegalArgumentException("number of sample instances must be greater than 1");
114
115                this.percentage = -number;
116        }
117
118        /**
119         * Construct a {@link StratifiedGroupedUniformRandomisedSampler}
120         * with the given number of instances to select, using with with-replacement
121         * or without-replacement sampling.
122         * 
123         * @param number
124         *            number of instances to select
125         * @param withReplacement
126         *            should the sampling be performed with replacement (true); or
127         *            without replacement (false).
128         */
129        public StratifiedGroupedUniformRandomisedSampler(
130                        int number, boolean withReplacement)
131        {
132                this(number);
133                this.withReplacement = withReplacement;
134        }
135
136        @Override
137        public GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
138                        GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> dataset)
139        {
140                final MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample = new MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE>();
141                final Map<KEY, ListDataset<INSTANCE>> map = sample.getMap();
142
143                for (final KEY key : dataset.getGroups()) {
144                        final List<INSTANCE> list = DatasetAdaptors.asList(dataset
145                                        .getInstances(key));
146                        final int size = list.size();
147
148                        final boolean skip;
149                        final int N;
150                        if (percentage >= 0) {
151                                // if we want more than 50%, it's better to select 1-percentage
152                                // indexes to skip
153                                skip = percentage > 0.5;
154                                final double per = skip ? 1.0 - percentage : percentage;
155
156                                N = (int) Math.round(size * per);
157                        } else {
158                                N = (int) -percentage;
159                                skip = N > (size / 2);
160                        }
161
162                        int[] selectedIds;
163                        if (withReplacement) {
164                                selectedIds = RandomData.getRandomIntArray(N, 0, size);
165                        } else {
166                                selectedIds = RandomData.getUniqueRandomInts(N, 0, size);
167                        }
168
169                        if (!skip) {
170                                map.put(key, new ListBackedDataset<INSTANCE>(
171                                                new AcceptingListView<INSTANCE>(list, selectedIds)));
172                        } else {
173                                map.put(key, new ListBackedDataset<INSTANCE>(
174                                                new SkippingListView<INSTANCE>(list, selectedIds)));
175                        }
176                }
177
178                return sample;
179        }
180}