001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.experiment.dataset.sampling;
031
032import java.util.ArrayList;
033import java.util.Collections;
034import java.util.List;
035
036import org.openimaj.data.dataset.GroupedDataset;
037import org.openimaj.data.dataset.ListDataset;
038import org.openimaj.data.dataset.MapBackedDataset;
039
040/**
041 * Sampler that samples whole groups from a {@link GroupedDataset}. Groups are
042 * either selected randomly or from the first ones returned by the iterator over
043 * the dataset keys.
044 * 
045 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
046 * 
047 * @param <KEY>
048 *            Type of groups
049 * @param <INSTANCE>
050 *            Type of instances
051 */
052public class GroupSampler<KEY, INSTANCE>
053                implements
054                Sampler<GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE>>
055{
056        int numGroups;
057        boolean random;
058
059        /**
060         * Construct the sample to extract the given number of groups, either
061         * randomly or by taking them in the order provided by the iterator of
062         * groups.
063         * 
064         * @param numGroups
065         *            the number of groups
066         * @param random
067         *            should the sample groups be chosen randomly?
068         */
069        public GroupSampler(int numGroups, boolean random) {
070                this.numGroups = numGroups;
071                this.random = random;
072        }
073
074        @Override
075        public GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
076                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset)
077        {
078                final MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample = new MapBackedDataset<KEY, ListDataset<INSTANCE>, INSTANCE>();
079
080                final List<KEY> keys = new ArrayList<KEY>(dataset.getGroups());
081                if (random) {
082                        Collections.shuffle(keys);
083                }
084
085                for (int i = 0; i < numGroups; i++) {
086                        final KEY key = keys.get(i);
087                        sample.add(key, dataset.get(key));
088                }
089
090                return sample;
091        }
092
093        /**
094         * Sample a dataset with the given number of groups to select. Groups are
095         * either selected randomly or from the first ones returned by the iterator
096         * over the dataset keys.
097         * 
098         * @param dataset
099         *            the dataset to sample
100         * @param numGroups
101         *            the number of groups
102         * @param random
103         *            should the sample groups be chosen randomly?
104         * @return the sampled dataset
105         */
106        public static <KEY, INSTANCE> GroupedDataset<KEY, ListDataset<INSTANCE>, INSTANCE> sample(
107                        GroupedDataset<KEY, ? extends ListDataset<INSTANCE>, INSTANCE> dataset, int numGroups,
108                        boolean random)
109        {
110                return new GroupSampler<KEY, INSTANCE>(numGroups, random).sample(dataset);
111        }
112}