001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.experiment.dataset.sampling;
031
032import java.util.List;
033
034import org.openimaj.data.RandomData;
035import org.openimaj.data.dataset.ListBackedDataset;
036import org.openimaj.data.dataset.ListDataset;
037import org.openimaj.experiment.dataset.util.DatasetAdaptors;
038import org.openimaj.util.list.AcceptingListView;
039import org.openimaj.util.list.SkippingListView;
040
041/**
042 * A uniformly random sampling scheme for {@link ListDataset}s. Both sampling
043 * with and without replacement are supported. The sampler returns a "view" on
044 * top of the input dataset that selects a predefined fraction of the data.
045 * 
046 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
047 * 
048 * @param <INSTANCE>
049 *            Type of instances
050 */
051public class UniformRandomisedSampler<INSTANCE> implements Sampler<ListDataset<INSTANCE>> {
052        private boolean withReplacement = false;
053
054        // this is overloaded to hold either a percentage or number of instances.
055        // Percentages are stored in the range 0..1; numbers are stored as -number.
056        private double percentage;
057
058        /**
059         * Construct a {@link UniformRandomisedSampler} with the given percentage of
060         * instances to select. By default, the sampling is without replacement
061         * (i.e. an instance can only be selected once).
062         * 
063         * @param percentage
064         *            percentage of instances to select
065         */
066        public UniformRandomisedSampler(double percentage) {
067                if (percentage < 0 || percentage > 1)
068                        throw new IllegalArgumentException("percentage of sample instances must be between 0 and 1");
069
070                this.percentage = percentage;
071        }
072
073        /**
074         * Construct a {@link UniformRandomisedSampler} with the given percentage of
075         * instances to select, using with with-replacement or without-replacement
076         * sampling.
077         * 
078         * @param percentage
079         *            percentage of instances to select
080         * @param withReplacement
081         *            should the sampling be performed with replacement (true); or
082         *            without replacement (false).
083         */
084        public UniformRandomisedSampler(double percentage, boolean withReplacement) {
085                this(percentage);
086                this.withReplacement = withReplacement;
087        }
088
089        /**
090         * Construct a {@link UniformRandomisedSampler} with the given number of
091         * instances to select. By default, the sampling is without replacement
092         * (i.e. an instance can only be selected once).
093         * 
094         * @param number
095         *            number of instances to select
096         */
097        public UniformRandomisedSampler(int number) {
098                if (number < 1)
099                        throw new IllegalArgumentException("number of sample instances must be bigger than 0");
100
101                this.percentage = -number;
102        }
103
104        /**
105         * Construct a {@link UniformRandomisedSampler} with the given number of
106         * instances to select, using with with-replacement or without-replacement
107         * sampling.
108         * 
109         * @param number
110         *            number of instances to select
111         * @param withReplacement
112         *            should the sampling be performed with replacement (true); or
113         *            without replacement (false).
114         */
115        public UniformRandomisedSampler(int number, boolean withReplacement) {
116                this(number);
117                this.withReplacement = withReplacement;
118        }
119
120        @Override
121        public ListDataset<INSTANCE> sample(ListDataset<INSTANCE> dataset) {
122                final boolean skip;
123                final int N;
124                if (percentage >= 0) {
125                        // if we want more than 50%, it's better to select 1-percentage
126                        // indexes to skip
127                        skip = percentage > 0.5;
128                        final double per = skip ? 1.0 - percentage : percentage;
129
130                        N = (int) Math.round(dataset.size() * per);
131                } else {
132                        N = (int) -percentage;
133                        skip = N > (dataset.size() / 2);
134                }
135
136                int[] selectedIds;
137                if (withReplacement) {
138                        selectedIds = RandomData.getRandomIntArray(N, 0, dataset.size());
139                } else {
140                        selectedIds = RandomData.getUniqueRandomInts(N, 0, dataset.size());
141                }
142
143                final List<INSTANCE> listView = DatasetAdaptors.asList(dataset);
144
145                if (!skip) {
146                        return new ListBackedDataset<INSTANCE>(new AcceptingListView<INSTANCE>(listView, selectedIds));
147                }
148
149                return new ListBackedDataset<INSTANCE>(new SkippingListView<INSTANCE>(listView, selectedIds));
150        }
151}