001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.math.model.fit;
031
032import java.util.ArrayList;
033import java.util.Arrays;
034import java.util.List;
035
036import org.apache.commons.math3.distribution.ChiSquaredDistribution;
037import org.openimaj.citation.annotation.Reference;
038import org.openimaj.citation.annotation.ReferenceType;
039import org.openimaj.math.model.EstimatableModel;
040import org.openimaj.math.model.fit.residuals.ResidualCalculator;
041import org.openimaj.math.util.DoubleArrayStatsUtils;
042import org.openimaj.util.CollectionSampler;
043import org.openimaj.util.UniformSampler;
044import org.openimaj.util.pair.IndependentPair;
045
046/**
047 * Least Median of Squares robust model fitting
048 *
049 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
050 *
051 * @param <I>
052 *            type of independent data
053 * @param <D>
054 *            type of dependent data
055 * @param <M>
056 *            concrete type of model learned
057 */
058@Reference(
059                type = ReferenceType.Article,
060                author = { "Peter J. Rousseeuw" },
061                title = "Least Median of Squares Regression",
062                year = "1984",
063                journal = "Journal of the American Statistical Association",
064                pages = { "871", "", "880" },
065                url = "http://www.jstor.org/stable/2288718",
066                month = "December",
067                number = "388",
068                volume = "79")
069public class LMedS<I, D, M extends EstimatableModel<I, D>> implements RobustModelFitting<I, D, M> {
070        /**
071         * Probability that at least one of the samples drawn is good
072         */
073        double probability = 0.99;
074
075        /**
076         * The level of inlier noise (standard deviation of Gaussian noise inherent
077         * in the data). Set to -1 if unknown.
078         */
079        double inlierNoiseLevel = -1;
080
081        /**
082         * Expected proportion of outliers
083         */
084        double outlierProportion = 0.4;
085
086        /**
087         * Number of degrees of freedom of the model; used in conjunction with
088         * inlierNoiseLevel if set to estimate the inliers.
089         */
090        private double degreesOfFreedom;
091
092        protected ResidualCalculator<I, D, M> residualEstimator;
093
094        protected boolean improveEstimate;
095
096        protected M model;
097        protected M bestModel;
098        protected List<IndependentPair<I, D>> inliers = new ArrayList<IndependentPair<I, D>>();
099        protected List<IndependentPair<I, D>> outliers = new ArrayList<IndependentPair<I, D>>();
100        protected CollectionSampler<IndependentPair<I, D>> sampler;
101        private double bestMedianError;
102
103        /**
104         * Construct with the given model and residual calculator. The proportion of
105         * outliers is assumed to be 0.4. The threshold for determining inliers and
106         * outliers is automatically computed. Uniform random sampling is used for
107         * creating the subsets.
108         *
109         * @param model
110         *            the model to estimate
111         * @param residualEstimator
112         *            the algorithm to compute residuals of the model
113         * @param impEst
114         *            True if we want to perform a final fitting of the model with
115         *            all inliers, false otherwise
116         */
117        public LMedS(M model, ResidualCalculator<I, D, M> residualEstimator, boolean impEst) {
118                this(model, residualEstimator, impEst, new UniformSampler<IndependentPair<I, D>>());
119        }
120
121        /**
122         * Construct with the given model, residual calculator and estimated
123         * proportion of outliers. The threshold for determining inliers and
124         * outliers is automatically computed. Uniform random sampling is used for
125         * creating the subsets.
126         *
127         * @param model
128         *            the model to estimate
129         * @param residualEstimator
130         *            the algorithm to compute residuals of the model
131         * @param outlierProportion
132         *            Expected proportion of outliers
133         * @param impEst
134         *            True if we want to perform a final fitting of the model with
135         *            all inliers, false otherwise
136         */
137        public LMedS(M model, ResidualCalculator<I, D, M> residualEstimator, double outlierProportion, boolean impEst) {
138                this(model, residualEstimator, impEst);
139                this.outlierProportion = outlierProportion;
140        }
141
142        /**
143         * Construct with the given model, residual calculator and estimated
144         * proportion of outliers. The given inlier noise level and number of
145         * degrees of freedom of the model are used to estimate the optimal
146         * threshold for determining inliers versus outliers. Uniform random
147         * sampling is used for creating the subsets.
148         *
149         * @param model
150         *            the model to estimate
151         * @param residualEstimator
152         *            the algorithm to compute residuals of the model
153         * @param outlierProportion
154         *            Expected proportion of outliers
155         * @param inlierNoiseLevel
156         *            The level of inlier noise (standard deviation of Gaussian
157         *            noise inherent in the data).
158         * @param degreesOfFreedom
159         *            Number of degrees of freedom of the model
160         * @param impEst
161         *            True if we want to perform a final fitting of the model with
162         *            all inliers, false otherwise
163         */
164        public LMedS(M model, ResidualCalculator<I, D, M> residualEstimator, double outlierProportion,
165                        double inlierNoiseLevel, double degreesOfFreedom, boolean impEst)
166        {
167                this(model, residualEstimator, outlierProportion, impEst);
168                this.inlierNoiseLevel = inlierNoiseLevel;
169                this.degreesOfFreedom = degreesOfFreedom;
170        }
171
172        /**
173         * Construct with the given model and residual calculator. The proportion of
174         * outliers is assumed to be 0.4. The threshold for determining inliers and
175         * outliers is automatically computed.
176         *
177         * @param model
178         *            the model to estimate
179         * @param residualEstimator
180         *            the algorithm to compute residuals of the model
181         * @param impEst
182         *            True if we want to perform a final fitting of the model with
183         *            all inliers, false otherwise
184         * @param sampler
185         *            the sampling algorithm for selecting random subsets
186         */
187        @SuppressWarnings("unchecked")
188        public LMedS(M model, ResidualCalculator<I, D, M> residualEstimator, boolean impEst,
189                        CollectionSampler<IndependentPair<I, D>> sampler)
190        {
191                this.model = model;
192                this.residualEstimator = residualEstimator;
193                this.bestModel = (M) model.clone();
194                this.improveEstimate = impEst;
195                this.sampler = sampler;
196        }
197
198        /**
199         * Construct with the given model, residual calculator and estimated
200         * proportion of outliers. The threshold for determining inliers and
201         * outliers is automatically computed.
202         *
203         * @param model
204         *            the model to estimate
205         * @param residualEstimator
206         *            the algorithm to compute residuals of the model
207         * @param outlierProportion
208         *            Expected proportion of outliers
209         * @param impEst
210         *            True if we want to perform a final fitting of the model with
211         *            all inliers, false otherwise
212         * @param sampler
213         *            the sampling algorithm for selecting random subsets
214         */
215        public LMedS(M model, ResidualCalculator<I, D, M> residualEstimator, double outlierProportion, boolean impEst,
216                        CollectionSampler<IndependentPair<I, D>> sampler)
217        {
218                this(model, residualEstimator, impEst, sampler);
219                this.outlierProportion = outlierProportion;
220        }
221
222        /**
223         * Construct with the given model, residual calculator and estimated
224         * proportion of outliers. The given inlier noise level and number of
225         * degrees of freedom of the model are used to estimate the optimal
226         * threshold for determining inliers versus outliers.
227         *
228         * @param model
229         *            the model to estimate
230         * @param residualEstimator
231         *            the algorithm to compute residuals of the model
232         * @param outlierProportion
233         *            Expected proportion of outliers
234         * @param inlierNoiseLevel
235         *            The level of inlier noise (standard deviation of Gaussian
236         *            noise inherent in the data).
237         * @param degreesOfFreedom
238         *            Number of degrees of freedom of the model
239         * @param impEst
240         *            True if we want to perform a final fitting of the model with
241         *            all inliers, false otherwise
242         * @param sampler
243         *            the sampling algorithm for selecting random subsets
244         */
245        public LMedS(M model, ResidualCalculator<I, D, M> residualEstimator, double outlierProportion,
246                        double inlierNoiseLevel, double degreesOfFreedom, boolean impEst,
247                        CollectionSampler<IndependentPair<I, D>> sampler)
248        {
249                this(model, residualEstimator, outlierProportion, impEst, sampler);
250                this.inlierNoiseLevel = inlierNoiseLevel;
251                this.degreesOfFreedom = degreesOfFreedom;
252        }
253
254        @Override
255        public boolean fitData(List<? extends IndependentPair<I, D>> data) {
256                final int sampleSize = model.numItemsToEstimate();
257
258                if (data.size() < sampleSize)
259                        return false;
260
261                final int numSamples = (int) Math.ceil(Math.log(1 - probability)
262                                / Math.log(1 - Math.pow(1 - outlierProportion, sampleSize)));
263
264                double[] errors = new double[data.size()];
265                double[] bestErrors = new double[data.size()];
266                Arrays.fill(bestErrors, Double.MAX_VALUE);
267                bestMedianError = Double.MAX_VALUE;
268
269                sampler.setCollection(data);
270
271                for (int i = 0; i < numSamples; i++) {
272                        final List<? extends IndependentPair<I, D>> sample = sampler.sample(sampleSize);
273
274                        if (!model.estimate(sample))
275                                continue;
276
277                        residualEstimator.setModel(model);
278                        residualEstimator.computeResiduals(data, errors);
279
280                        final double medianError = DoubleArrayStatsUtils.median(errors);
281                        if (medianError < bestMedianError) {
282                                bestMedianError = medianError;
283
284                                // swap working model and best model
285                                final M tmp = bestModel;
286                                bestModel = model;
287                                model = tmp;
288
289                                final double[] tmp2 = bestErrors;
290                                bestErrors = errors;
291                                errors = tmp2;
292                        }
293                }
294
295                findInliersOutliers(data, bestErrors);
296
297                if (improveEstimate) {
298                        if (!bestModel.estimate(inliers))
299                                return false;
300                }
301
302                final double outlierProp = (double) outliers.size() / (double) data.size();
303
304                return outlierProp < this.outlierProportion;
305        }
306
307        private void findInliersOutliers(List<? extends IndependentPair<I, D>> data, double[] bestErrors) {
308                inliers.clear();
309                outliers.clear();
310
311                double threshold;
312                if (inlierNoiseLevel > 0)
313                        threshold = inlierNoiseLevel * inlierNoiseLevel
314                                        * new ChiSquaredDistribution(degreesOfFreedom).inverseCumulativeProbability(probability);
315                else {
316                        final double sigmahat = 1.4826 * (1 + 5 / (Math.max(1, data.size() - model.numItemsToEstimate())))
317                                        * Math.sqrt(bestMedianError);
318                        // http://research.microsoft.com/en-us/um/people/zhang/INRIA/Publis/Tutorial-Estim/node25.html
319                        threshold = (2.5 * sigmahat) * (2.5 * sigmahat);
320                }
321
322                for (int i = 0; i < data.size(); i++) {
323                        if (bestErrors[i] < threshold)
324                                inliers.add(data.get(i));
325                        else
326                                outliers.add(data.get(i));
327                }
328        }
329
330        @Override
331        public M getModel() {
332                return bestModel;
333        }
334
335        @Override
336        public List<? extends IndependentPair<I, D>> getInliers() {
337                return inliers;
338        }
339
340        @Override
341        public List<? extends IndependentPair<I, D>> getOutliers() {
342                return outliers;
343        }
344
345        @Override
346        public int numItemsToEstimate() {
347                return model.numItemsToEstimate();
348        }
349}