View Javadoc

1   /**
2    * Copyright (c) 2011, The University of Southampton and the individual contributors.
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without modification,
6    * are permitted provided that the following conditions are met:
7    *
8    *   * 	Redistributions of source code must retain the above copyright notice,
9    * 	this list of conditions and the following disclaimer.
10   *
11   *   *	Redistributions in binary form must reproduce the above copyright notice,
12   * 	this list of conditions and the following disclaimer in the documentation
13   * 	and/or other materials provided with the distribution.
14   *
15   *   *	Neither the name of the University of Southampton nor the names of its
16   * 	contributors may be used to endorse or promote products derived from this
17   * 	software without specific prior written permission.
18   *
19   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22   * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29   */
30  package org.openimaj.image.text.extraction.swt;
31  
32  import java.util.ArrayList;
33  import java.util.List;
34  import java.util.Set;
35  
36  import org.openimaj.citation.annotation.Reference;
37  import org.openimaj.citation.annotation.ReferenceType;
38  import org.openimaj.image.FImage;
39  import org.openimaj.image.analyser.ImageAnalyser;
40  import org.openimaj.image.pixel.ConnectedComponent;
41  import org.openimaj.image.pixel.Pixel;
42  import org.openimaj.image.processing.edges.CannyEdgeDetector;
43  import org.openimaj.image.processing.edges.StrokeWidthTransform;
44  import org.openimaj.image.processing.resize.ResizeProcessor;
45  import org.openimaj.util.set.DisjointSetForest;
46  
47  /**
48   * Implementation of the Stroke Width Transform text detection algorithm by
49   * Epshtein et al.
50   * <p>
51   * This is a (relatively) high-performance text detection technique that does
52   * not require training (except for parameter setting) and is language
53   * independent. The algorithm automatically identifies individual characters
54   * ("letters"), as well as performing word grouping and line segmentation.
55   * <p>
56   * There is an implicit assumption in this implementation that the text is
57   * *almost* horizontal. This implementation cannot be considered to be rotation
58   * invariant. It also has difficulties with curved text.
59   * 
60   * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
61   */
62  @Reference(
63  		type = ReferenceType.Inproceedings,
64  		author = { "Epshtein, B.", "Ofek, E.", "Wexler, Y." },
65  		title = "Detecting text in natural scenes with stroke width transform",
66  		year = "2010",
67  		booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE Conference on",
68  		pages = { "2963", "2970" },
69  		customData = {
70  				"keywords",
71  				"image processing;text analysis;image operator;image pixel;natural images;natural scenes;stroke width transform;text detection;Colored noise;Computer vision;Engines;Filter bank;Geometry;Image segmentation;Layout;Optical character recognition software;Pixel;Robustness",
72  				"doi", "10.1109/CVPR.2010.5540041",
73  				"ISSN", "1063-6919"
74  		})
75  public class SWTTextDetector implements ImageAnalyser<FImage> {
76  	/**
77  	 * Text search "directions": Dark text on a lighter background, light text
78  	 * on a dark background and both.
79  	 * 
80  	 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
81  	 * 
82  	 */
83  	public static enum Direction {
84  		/**
85  		 * Dark text against a lighter background
86  		 */
87  		DarkOnLight {
88  			@Override
89  			protected void detect(FImage image, SWTTextDetector detector) {
90  				final StrokeWidthTransform swt = new StrokeWidthTransform(true, detector.options.canny);
91  				swt.setMaxStrokeWidth(detector.options.maxStrokeWidth);
92  				final FImage swtImage = image.process(swt);
93  				detector.analyseImage(image, swtImage);
94  			}
95  		},
96  		/**
97  		 * Light text against a lighter background
98  		 */
99  		LightOnDark {
100 			@Override
101 			protected void detect(FImage image, SWTTextDetector detector) {
102 				final StrokeWidthTransform swt = new StrokeWidthTransform(false, detector.options.canny);
103 				swt.setMaxStrokeWidth(detector.options.maxStrokeWidth);
104 				final FImage swtImage = image.process(swt);
105 				detector.analyseImage(image, swtImage);
106 			}
107 		},
108 		/**
109 		 * Search for both light and dark text
110 		 */
111 		Both {
112 			@Override
113 			protected void detect(FImage image, SWTTextDetector detector) {
114 				final StrokeWidthTransform swt = new StrokeWidthTransform(true, detector.options.canny);
115 				swt.setMaxStrokeWidth(detector.options.maxStrokeWidth);
116 				FImage swtImage = image.process(swt);
117 				detector.analyseImage(image, swtImage);
118 
119 				swt.setDirection(false);
120 				swtImage = image.process(swt);
121 				detector.analyseImage(image, swtImage);
122 			}
123 		};
124 
125 		protected abstract void detect(FImage image, SWTTextDetector detector);
126 	}
127 
128 	/**
129 	 * Options for controlling the {@link SWTTextDetector}.
130 	 * 
131 	 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
132 	 */
133 	public static class Options {
134 
135 		/**
136 		 * The "direction" to perform the SWT
137 		 */
138 		public Direction direction = Direction.DarkOnLight;
139 
140 		/**
141 		 * The canny edge detector to use for the SWT.
142 		 */
143 		public CannyEdgeDetector canny = new CannyEdgeDetector(1);
144 
145 		/**
146 		 * Upscale the image to double size before applying the SWT.
147 		 */
148 		public boolean doubleSize = false;
149 
150 		/**
151 		 * Maximum allowed ratio of a pair of stroke widths for them to be
152 		 * considered part of the same connected component.
153 		 */
154 		public float strokeWidthRatio = 3.0f;
155 
156 		/**
157 		 * Maximum allowed variance of stroke width in a single character as a
158 		 * percentage of the mean.
159 		 */
160 		public double letterVarianceMean = 0.93;
161 
162 		/**
163 		 * Maximum allowed aspect ratio for a single letter
164 		 */
165 		public double maxAspectRatio = 10;
166 
167 		/**
168 		 * Maximum allowed ratio of diameter to stroke width for a single
169 		 * character.
170 		 */
171 		public double maxDiameterStrokeRatio = 10;
172 
173 		/**
174 		 * Minimum allowed component size; used to quickly filter out small
175 		 * components.
176 		 */
177 		public int minArea = 38;
178 
179 		/**
180 		 * Minimum character height
181 		 */
182 		public float minHeight = 10;
183 
184 		/**
185 		 * Maximum character height
186 		 */
187 		public float maxHeight = 300;
188 
189 		/**
190 		 * Maximum allowed number of overlapping characters
191 		 */
192 		public int maxNumOverlappingBoxes = 10;
193 
194 		/**
195 		 * Maximum allowed stroke width
196 		 */
197 		public int maxStrokeWidth = 70;
198 
199 		/**
200 		 * Maximum ratio of stroke width for two letters to be considered to be
201 		 * related
202 		 */
203 		public float medianStrokeWidthRatio = 2;
204 
205 		/**
206 		 * Maximum ratio of height for two letters to be considered to be
207 		 * related
208 		 */
209 		public float letterHeightRatio = 2;
210 
211 		/**
212 		 * Maximum difference in intensity for two letters to be considered to
213 		 * be related
214 		 */
215 		public float intensityThreshold = 0.12f;
216 
217 		/**
218 		 * The width multiplier for two letters to be considered to be related.
219 		 * Distance between centroids must be less than widthMultiplier *
220 		 * maxLetterWidth.
221 		 */
222 		public float widthMultiplier = 3;
223 
224 		/**
225 		 * Minimum number of allowed letters on a line
226 		 */
227 		public int minLettersPerLine = 3;
228 
229 		/**
230 		 * Ratio of vertical intersection for character pairing. This helps
231 		 * ensure that the characters are horizontal.
232 		 */
233 		public float intersectRatio = 1.3f;
234 
235 		/**
236 		 * Ratio of the interclass std dev of the letter spacings to the mean to
237 		 * suggest a word break.
238 		 */
239 		public float wordBreakdownRatio = 1f;
240 	}
241 
242 	/**
243 	 * 8-connected locations
244 	 */
245 	private final static int[][] connect8 = {
246 			{ -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 }, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } };
247 
248 	/**
249 	 * The parameters of the algorithm
250 	 */
251 	protected Options options;
252 
253 	private List<LetterCandidate> letters = null;
254 	private List<LineCandidate> lines = null;
255 	private List<WordCandidate> words = null;
256 
257 	/**
258 	 * Construct the {@link SWTTextDetector} with the default parameters.
259 	 */
260 	public SWTTextDetector() {
261 		this(new Options());
262 	}
263 
264 	/**
265 	 * Construct the {@link SWTTextDetector} with the given parameters.
266 	 * 
267 	 * @param options
268 	 *            the parameters
269 	 */
270 	public SWTTextDetector(Options options) {
271 		this.options = options;
272 	}
273 
274 	/**
275 	 * Get the current options.
276 	 * 
277 	 * @return the current options.
278 	 */
279 	public Options getOptions() {
280 		return options;
281 	}
282 
283 	/**
284 	 * Modified connected component algorithm - uses a modified predicate to
285 	 * group SWT pixels based on their stroke width ratio.
286 	 * 
287 	 * @param image
288 	 *            swt image
289 	 * @return the detected components
290 	 */
291 	private List<ConnectedComponent> findComponents(FImage image) {
292 		final DisjointSetForest<Pixel> forest = new DisjointSetForest<Pixel>();
293 
294 		Pixel current = new Pixel();
295 		Pixel next = new Pixel();
296 		for (int y = 0; y < image.height; y++) {
297 			for (int x = 0; x < image.width; x++) {
298 				final float currentValue = image.pixels[y][x];
299 
300 				if (currentValue > 0 && currentValue != Float.POSITIVE_INFINITY) {
301 					current.x = x;
302 					current.y = y;
303 
304 					if (forest.makeSet(current) != null)
305 						current = current.clone();
306 
307 					for (int i = 0; i < connect8.length; i++) {
308 						final int xx = x + connect8[i][0];
309 						final int yy = y + connect8[i][1];
310 
311 						if (xx >= 0 && xx < image.width - 1 && yy >= 0 && yy < image.height - 1) {
312 							final float value = image.pixels[yy][xx];
313 
314 							if (value > 0 && value != Float.POSITIVE_INFINITY) {
315 								next.x = xx;
316 								next.y = yy;
317 
318 								if (forest.makeSet(next) != null)
319 									next = next.clone();
320 
321 								if ((Math.max(currentValue, value) / Math.min(currentValue, value)) < options.strokeWidthRatio)
322 									forest.union(current, next);
323 							}
324 						}
325 					}
326 				}
327 			}
328 		}
329 
330 		final List<ConnectedComponent> components = new ArrayList<ConnectedComponent>();
331 		for (final Set<Pixel> pixels : forest.getSubsets()) {
332 			final ConnectedComponent cc = new ConnectedComponent(pixels);
333 			components.add(cc);
334 		}
335 
336 		return components;
337 	}
338 
339 	@Override
340 	public void analyseImage(FImage image) {
341 		letters = new ArrayList<LetterCandidate>();
342 		lines = new ArrayList<LineCandidate>();
343 		words = new ArrayList<WordCandidate>();
344 
345 		if (options.doubleSize)
346 			image = ResizeProcessor.doubleSize(image);
347 
348 		options.direction.detect(image, this);
349 	}
350 
351 	protected void analyseImage(FImage image, FImage swt) {
352 		final List<ConnectedComponent> comps = findComponents(swt);
353 		final List<LetterCandidate> tmpLetters = LetterCandidate.findLetters(comps, swt, image, options);
354 		final List<LineCandidate> tmpLines = LineCandidate.extractLines(tmpLetters, this.options);
355 
356 		this.letters.addAll(tmpLetters);
357 		this.lines.addAll(tmpLines);
358 		for (final LineCandidate line : tmpLines) {
359 			this.words.addAll(line.words);
360 		}
361 	}
362 
363 	/**
364 	 * Get the detected candidate lines of text
365 	 * 
366 	 * @return the lines of text
367 	 */
368 	public List<LineCandidate> getLines() {
369 		return lines;
370 	}
371 
372 	/**
373 	 * Get the unfiltered detected characters. Normally you would want to get
374 	 * the lines and then get the characters from each line as these will be
375 	 * filtered.
376 	 * 
377 	 * @see #getLines()
378 	 * 
379 	 * @return the unfiltered characters
380 	 */
381 	public List<LetterCandidate> getLetters() {
382 		return letters;
383 	}
384 }