001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.image.text.extraction.swt;
031
032import java.util.ArrayList;
033import java.util.List;
034import java.util.Set;
035
036import org.openimaj.citation.annotation.Reference;
037import org.openimaj.citation.annotation.ReferenceType;
038import org.openimaj.image.FImage;
039import org.openimaj.image.analyser.ImageAnalyser;
040import org.openimaj.image.pixel.ConnectedComponent;
041import org.openimaj.image.pixel.Pixel;
042import org.openimaj.image.processing.edges.CannyEdgeDetector;
043import org.openimaj.image.processing.edges.StrokeWidthTransform;
044import org.openimaj.image.processing.resize.ResizeProcessor;
045import org.openimaj.util.set.DisjointSetForest;
046
047/**
048 * Implementation of the Stroke Width Transform text detection algorithm by
049 * Epshtein et al.
050 * <p>
051 * This is a (relatively) high-performance text detection technique that does
052 * not require training (except for parameter setting) and is language
053 * independent. The algorithm automatically identifies individual characters
054 * ("letters"), as well as performing word grouping and line segmentation.
055 * <p>
056 * There is an implicit assumption in this implementation that the text is
057 * *almost* horizontal. This implementation cannot be considered to be rotation
058 * invariant. It also has difficulties with curved text.
059 * 
060 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
061 */
062@Reference(
063                type = ReferenceType.Inproceedings,
064                author = { "Epshtein, B.", "Ofek, E.", "Wexler, Y." },
065                title = "Detecting text in natural scenes with stroke width transform",
066                year = "2010",
067                booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE Conference on",
068                pages = { "2963", "2970" },
069                customData = {
070                                "keywords",
071                                "image processing;text analysis;image operator;image pixel;natural images;natural scenes;stroke width transform;text detection;Colored noise;Computer vision;Engines;Filter bank;Geometry;Image segmentation;Layout;Optical character recognition software;Pixel;Robustness",
072                                "doi", "10.1109/CVPR.2010.5540041",
073                                "ISSN", "1063-6919"
074                })
075public class SWTTextDetector implements ImageAnalyser<FImage> {
076        /**
077         * Text search "directions": Dark text on a lighter background, light text
078         * on a dark background and both.
079         * 
080         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
081         * 
082         */
083        public static enum Direction {
084                /**
085                 * Dark text against a lighter background
086                 */
087                DarkOnLight {
088                        @Override
089                        protected void detect(FImage image, SWTTextDetector detector) {
090                                final StrokeWidthTransform swt = new StrokeWidthTransform(true, detector.options.canny);
091                                swt.setMaxStrokeWidth(detector.options.maxStrokeWidth);
092                                final FImage swtImage = image.process(swt);
093                                detector.analyseImage(image, swtImage);
094                        }
095                },
096                /**
097                 * Light text against a lighter background
098                 */
099                LightOnDark {
100                        @Override
101                        protected void detect(FImage image, SWTTextDetector detector) {
102                                final StrokeWidthTransform swt = new StrokeWidthTransform(false, detector.options.canny);
103                                swt.setMaxStrokeWidth(detector.options.maxStrokeWidth);
104                                final FImage swtImage = image.process(swt);
105                                detector.analyseImage(image, swtImage);
106                        }
107                },
108                /**
109                 * Search for both light and dark text
110                 */
111                Both {
112                        @Override
113                        protected void detect(FImage image, SWTTextDetector detector) {
114                                final StrokeWidthTransform swt = new StrokeWidthTransform(true, detector.options.canny);
115                                swt.setMaxStrokeWidth(detector.options.maxStrokeWidth);
116                                FImage swtImage = image.process(swt);
117                                detector.analyseImage(image, swtImage);
118
119                                swt.setDirection(false);
120                                swtImage = image.process(swt);
121                                detector.analyseImage(image, swtImage);
122                        }
123                };
124
125                protected abstract void detect(FImage image, SWTTextDetector detector);
126        }
127
128        /**
129         * Options for controlling the {@link SWTTextDetector}.
130         * 
131         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
132         */
133        public static class Options {
134
135                /**
136                 * The "direction" to perform the SWT
137                 */
138                public Direction direction = Direction.DarkOnLight;
139
140                /**
141                 * The canny edge detector to use for the SWT.
142                 */
143                public CannyEdgeDetector canny = new CannyEdgeDetector(1);
144
145                /**
146                 * Upscale the image to double size before applying the SWT.
147                 */
148                public boolean doubleSize = false;
149
150                /**
151                 * Maximum allowed ratio of a pair of stroke widths for them to be
152                 * considered part of the same connected component.
153                 */
154                public float strokeWidthRatio = 3.0f;
155
156                /**
157                 * Maximum allowed variance of stroke width in a single character as a
158                 * percentage of the mean.
159                 */
160                public double letterVarianceMean = 0.93;
161
162                /**
163                 * Maximum allowed aspect ratio for a single letter
164                 */
165                public double maxAspectRatio = 10;
166
167                /**
168                 * Maximum allowed ratio of diameter to stroke width for a single
169                 * character.
170                 */
171                public double maxDiameterStrokeRatio = 10;
172
173                /**
174                 * Minimum allowed component size; used to quickly filter out small
175                 * components.
176                 */
177                public int minArea = 38;
178
179                /**
180                 * Minimum character height
181                 */
182                public float minHeight = 10;
183
184                /**
185                 * Maximum character height
186                 */
187                public float maxHeight = 300;
188
189                /**
190                 * Maximum allowed number of overlapping characters
191                 */
192                public int maxNumOverlappingBoxes = 10;
193
194                /**
195                 * Maximum allowed stroke width
196                 */
197                public int maxStrokeWidth = 70;
198
199                /**
200                 * Maximum ratio of stroke width for two letters to be considered to be
201                 * related
202                 */
203                public float medianStrokeWidthRatio = 2;
204
205                /**
206                 * Maximum ratio of height for two letters to be considered to be
207                 * related
208                 */
209                public float letterHeightRatio = 2;
210
211                /**
212                 * Maximum difference in intensity for two letters to be considered to
213                 * be related
214                 */
215                public float intensityThreshold = 0.12f;
216
217                /**
218                 * The width multiplier for two letters to be considered to be related.
219                 * Distance between centroids must be less than widthMultiplier *
220                 * maxLetterWidth.
221                 */
222                public float widthMultiplier = 3;
223
224                /**
225                 * Minimum number of allowed letters on a line
226                 */
227                public int minLettersPerLine = 3;
228
229                /**
230                 * Ratio of vertical intersection for character pairing. This helps
231                 * ensure that the characters are horizontal.
232                 */
233                public float intersectRatio = 1.3f;
234
235                /**
236                 * Ratio of the interclass std dev of the letter spacings to the mean to
237                 * suggest a word break.
238                 */
239                public float wordBreakdownRatio = 1f;
240        }
241
242        /**
243         * 8-connected locations
244         */
245        private final static int[][] connect8 = {
246                        { -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 }, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
247
248        /**
249         * The parameters of the algorithm
250         */
251        protected Options options;
252
253        private List<LetterCandidate> letters = null;
254        private List<LineCandidate> lines = null;
255        private List<WordCandidate> words = null;
256
257        /**
258         * Construct the {@link SWTTextDetector} with the default parameters.
259         */
260        public SWTTextDetector() {
261                this(new Options());
262        }
263
264        /**
265         * Construct the {@link SWTTextDetector} with the given parameters.
266         * 
267         * @param options
268         *            the parameters
269         */
270        public SWTTextDetector(Options options) {
271                this.options = options;
272        }
273
274        /**
275         * Get the current options.
276         * 
277         * @return the current options.
278         */
279        public Options getOptions() {
280                return options;
281        }
282
283        /**
284         * Modified connected component algorithm - uses a modified predicate to
285         * group SWT pixels based on their stroke width ratio.
286         * 
287         * @param image
288         *            swt image
289         * @return the detected components
290         */
291        private List<ConnectedComponent> findComponents(FImage image) {
292                final DisjointSetForest<Pixel> forest = new DisjointSetForest<Pixel>();
293
294                Pixel current = new Pixel();
295                Pixel next = new Pixel();
296                for (int y = 0; y < image.height; y++) {
297                        for (int x = 0; x < image.width; x++) {
298                                final float currentValue = image.pixels[y][x];
299
300                                if (currentValue > 0 && currentValue != Float.POSITIVE_INFINITY) {
301                                        current.x = x;
302                                        current.y = y;
303
304                                        if (forest.makeSet(current) != null)
305                                                current = current.clone();
306
307                                        for (int i = 0; i < connect8.length; i++) {
308                                                final int xx = x + connect8[i][0];
309                                                final int yy = y + connect8[i][1];
310
311                                                if (xx >= 0 && xx < image.width - 1 && yy >= 0 && yy < image.height - 1) {
312                                                        final float value = image.pixels[yy][xx];
313
314                                                        if (value > 0 && value != Float.POSITIVE_INFINITY) {
315                                                                next.x = xx;
316                                                                next.y = yy;
317
318                                                                if (forest.makeSet(next) != null)
319                                                                        next = next.clone();
320
321                                                                if ((Math.max(currentValue, value) / Math.min(currentValue, value)) < options.strokeWidthRatio)
322                                                                        forest.union(current, next);
323                                                        }
324                                                }
325                                        }
326                                }
327                        }
328                }
329
330                final List<ConnectedComponent> components = new ArrayList<ConnectedComponent>();
331                for (final Set<Pixel> pixels : forest.getSubsets()) {
332                        final ConnectedComponent cc = new ConnectedComponent(pixels);
333                        components.add(cc);
334                }
335
336                return components;
337        }
338
339        @Override
340        public void analyseImage(FImage image) {
341                letters = new ArrayList<LetterCandidate>();
342                lines = new ArrayList<LineCandidate>();
343                words = new ArrayList<WordCandidate>();
344
345                if (options.doubleSize)
346                        image = ResizeProcessor.doubleSize(image);
347
348                options.direction.detect(image, this);
349        }
350
351        protected void analyseImage(FImage image, FImage swt) {
352                final List<ConnectedComponent> comps = findComponents(swt);
353                final List<LetterCandidate> tmpLetters = LetterCandidate.findLetters(comps, swt, image, options);
354                final List<LineCandidate> tmpLines = LineCandidate.extractLines(tmpLetters, this.options);
355
356                this.letters.addAll(tmpLetters);
357                this.lines.addAll(tmpLines);
358                for (final LineCandidate line : tmpLines) {
359                        this.words.addAll(line.words);
360                }
361        }
362
363        /**
364         * Get the detected candidate lines of text
365         * 
366         * @return the lines of text
367         */
368        public List<LineCandidate> getLines() {
369                return lines;
370        }
371
372        /**
373         * Get the unfiltered detected characters. Normally you would want to get
374         * the lines and then get the characters from each line as these will be
375         * filtered.
376         * 
377         * @see #getLines()
378         * 
379         * @return the unfiltered characters
380         */
381        public List<LetterCandidate> getLetters() {
382                return letters;
383        }
384}