001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.image.text.extraction.swt; 031 032import java.util.ArrayList; 033import java.util.List; 034import java.util.Set; 035 036import org.openimaj.citation.annotation.Reference; 037import org.openimaj.citation.annotation.ReferenceType; 038import org.openimaj.image.FImage; 039import org.openimaj.image.analyser.ImageAnalyser; 040import org.openimaj.image.pixel.ConnectedComponent; 041import org.openimaj.image.pixel.Pixel; 042import org.openimaj.image.processing.edges.CannyEdgeDetector; 043import org.openimaj.image.processing.edges.StrokeWidthTransform; 044import org.openimaj.image.processing.resize.ResizeProcessor; 045import org.openimaj.util.set.DisjointSetForest; 046 047/** 048 * Implementation of the Stroke Width Transform text detection algorithm by 049 * Epshtein et al. 050 * <p> 051 * This is a (relatively) high-performance text detection technique that does 052 * not require training (except for parameter setting) and is language 053 * independent. The algorithm automatically identifies individual characters 054 * ("letters"), as well as performing word grouping and line segmentation. 055 * <p> 056 * There is an implicit assumption in this implementation that the text is 057 * *almost* horizontal. This implementation cannot be considered to be rotation 058 * invariant. It also has difficulties with curved text. 059 * 060 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 061 */ 062@Reference( 063 type = ReferenceType.Inproceedings, 064 author = { "Epshtein, B.", "Ofek, E.", "Wexler, Y." }, 065 title = "Detecting text in natural scenes with stroke width transform", 066 year = "2010", 067 booktitle = "Computer Vision and Pattern Recognition (CVPR), 2010 IEEE Conference on", 068 pages = { "2963", "2970" }, 069 customData = { 070 "keywords", 071 "image processing;text analysis;image operator;image pixel;natural images;natural scenes;stroke width transform;text detection;Colored noise;Computer vision;Engines;Filter bank;Geometry;Image segmentation;Layout;Optical character recognition software;Pixel;Robustness", 072 "doi", "10.1109/CVPR.2010.5540041", 073 "ISSN", "1063-6919" 074 }) 075public class SWTTextDetector implements ImageAnalyser<FImage> { 076 /** 077 * Text search "directions": Dark text on a lighter background, light text 078 * on a dark background and both. 079 * 080 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 081 * 082 */ 083 public static enum Direction { 084 /** 085 * Dark text against a lighter background 086 */ 087 DarkOnLight { 088 @Override 089 protected void detect(FImage image, SWTTextDetector detector) { 090 final StrokeWidthTransform swt = new StrokeWidthTransform(true, detector.options.canny); 091 swt.setMaxStrokeWidth(detector.options.maxStrokeWidth); 092 final FImage swtImage = image.process(swt); 093 detector.analyseImage(image, swtImage); 094 } 095 }, 096 /** 097 * Light text against a lighter background 098 */ 099 LightOnDark { 100 @Override 101 protected void detect(FImage image, SWTTextDetector detector) { 102 final StrokeWidthTransform swt = new StrokeWidthTransform(false, detector.options.canny); 103 swt.setMaxStrokeWidth(detector.options.maxStrokeWidth); 104 final FImage swtImage = image.process(swt); 105 detector.analyseImage(image, swtImage); 106 } 107 }, 108 /** 109 * Search for both light and dark text 110 */ 111 Both { 112 @Override 113 protected void detect(FImage image, SWTTextDetector detector) { 114 final StrokeWidthTransform swt = new StrokeWidthTransform(true, detector.options.canny); 115 swt.setMaxStrokeWidth(detector.options.maxStrokeWidth); 116 FImage swtImage = image.process(swt); 117 detector.analyseImage(image, swtImage); 118 119 swt.setDirection(false); 120 swtImage = image.process(swt); 121 detector.analyseImage(image, swtImage); 122 } 123 }; 124 125 protected abstract void detect(FImage image, SWTTextDetector detector); 126 } 127 128 /** 129 * Options for controlling the {@link SWTTextDetector}. 130 * 131 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 132 */ 133 public static class Options { 134 135 /** 136 * The "direction" to perform the SWT 137 */ 138 public Direction direction = Direction.DarkOnLight; 139 140 /** 141 * The canny edge detector to use for the SWT. 142 */ 143 public CannyEdgeDetector canny = new CannyEdgeDetector(1); 144 145 /** 146 * Upscale the image to double size before applying the SWT. 147 */ 148 public boolean doubleSize = false; 149 150 /** 151 * Maximum allowed ratio of a pair of stroke widths for them to be 152 * considered part of the same connected component. 153 */ 154 public float strokeWidthRatio = 3.0f; 155 156 /** 157 * Maximum allowed variance of stroke width in a single character as a 158 * percentage of the mean. 159 */ 160 public double letterVarianceMean = 0.93; 161 162 /** 163 * Maximum allowed aspect ratio for a single letter 164 */ 165 public double maxAspectRatio = 10; 166 167 /** 168 * Maximum allowed ratio of diameter to stroke width for a single 169 * character. 170 */ 171 public double maxDiameterStrokeRatio = 10; 172 173 /** 174 * Minimum allowed component size; used to quickly filter out small 175 * components. 176 */ 177 public int minArea = 38; 178 179 /** 180 * Minimum character height 181 */ 182 public float minHeight = 10; 183 184 /** 185 * Maximum character height 186 */ 187 public float maxHeight = 300; 188 189 /** 190 * Maximum allowed number of overlapping characters 191 */ 192 public int maxNumOverlappingBoxes = 10; 193 194 /** 195 * Maximum allowed stroke width 196 */ 197 public int maxStrokeWidth = 70; 198 199 /** 200 * Maximum ratio of stroke width for two letters to be considered to be 201 * related 202 */ 203 public float medianStrokeWidthRatio = 2; 204 205 /** 206 * Maximum ratio of height for two letters to be considered to be 207 * related 208 */ 209 public float letterHeightRatio = 2; 210 211 /** 212 * Maximum difference in intensity for two letters to be considered to 213 * be related 214 */ 215 public float intensityThreshold = 0.12f; 216 217 /** 218 * The width multiplier for two letters to be considered to be related. 219 * Distance between centroids must be less than widthMultiplier * 220 * maxLetterWidth. 221 */ 222 public float widthMultiplier = 3; 223 224 /** 225 * Minimum number of allowed letters on a line 226 */ 227 public int minLettersPerLine = 3; 228 229 /** 230 * Ratio of vertical intersection for character pairing. This helps 231 * ensure that the characters are horizontal. 232 */ 233 public float intersectRatio = 1.3f; 234 235 /** 236 * Ratio of the interclass std dev of the letter spacings to the mean to 237 * suggest a word break. 238 */ 239 public float wordBreakdownRatio = 1f; 240 } 241 242 /** 243 * 8-connected locations 244 */ 245 private final static int[][] connect8 = { 246 { -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 }, { -1, -1 }, { 1, -1 }, { -1, 1 }, { 1, 1 } }; 247 248 /** 249 * The parameters of the algorithm 250 */ 251 protected Options options; 252 253 private List<LetterCandidate> letters = null; 254 private List<LineCandidate> lines = null; 255 private List<WordCandidate> words = null; 256 257 /** 258 * Construct the {@link SWTTextDetector} with the default parameters. 259 */ 260 public SWTTextDetector() { 261 this(new Options()); 262 } 263 264 /** 265 * Construct the {@link SWTTextDetector} with the given parameters. 266 * 267 * @param options 268 * the parameters 269 */ 270 public SWTTextDetector(Options options) { 271 this.options = options; 272 } 273 274 /** 275 * Get the current options. 276 * 277 * @return the current options. 278 */ 279 public Options getOptions() { 280 return options; 281 } 282 283 /** 284 * Modified connected component algorithm - uses a modified predicate to 285 * group SWT pixels based on their stroke width ratio. 286 * 287 * @param image 288 * swt image 289 * @return the detected components 290 */ 291 private List<ConnectedComponent> findComponents(FImage image) { 292 final DisjointSetForest<Pixel> forest = new DisjointSetForest<Pixel>(); 293 294 Pixel current = new Pixel(); 295 Pixel next = new Pixel(); 296 for (int y = 0; y < image.height; y++) { 297 for (int x = 0; x < image.width; x++) { 298 final float currentValue = image.pixels[y][x]; 299 300 if (currentValue > 0 && currentValue != Float.POSITIVE_INFINITY) { 301 current.x = x; 302 current.y = y; 303 304 if (forest.makeSet(current) != null) 305 current = current.clone(); 306 307 for (int i = 0; i < connect8.length; i++) { 308 final int xx = x + connect8[i][0]; 309 final int yy = y + connect8[i][1]; 310 311 if (xx >= 0 && xx < image.width - 1 && yy >= 0 && yy < image.height - 1) { 312 final float value = image.pixels[yy][xx]; 313 314 if (value > 0 && value != Float.POSITIVE_INFINITY) { 315 next.x = xx; 316 next.y = yy; 317 318 if (forest.makeSet(next) != null) 319 next = next.clone(); 320 321 if ((Math.max(currentValue, value) / Math.min(currentValue, value)) < options.strokeWidthRatio) 322 forest.union(current, next); 323 } 324 } 325 } 326 } 327 } 328 } 329 330 final List<ConnectedComponent> components = new ArrayList<ConnectedComponent>(); 331 for (final Set<Pixel> pixels : forest.getSubsets()) { 332 final ConnectedComponent cc = new ConnectedComponent(pixels); 333 components.add(cc); 334 } 335 336 return components; 337 } 338 339 @Override 340 public void analyseImage(FImage image) { 341 letters = new ArrayList<LetterCandidate>(); 342 lines = new ArrayList<LineCandidate>(); 343 words = new ArrayList<WordCandidate>(); 344 345 if (options.doubleSize) 346 image = ResizeProcessor.doubleSize(image); 347 348 options.direction.detect(image, this); 349 } 350 351 protected void analyseImage(FImage image, FImage swt) { 352 final List<ConnectedComponent> comps = findComponents(swt); 353 final List<LetterCandidate> tmpLetters = LetterCandidate.findLetters(comps, swt, image, options); 354 final List<LineCandidate> tmpLines = LineCandidate.extractLines(tmpLetters, this.options); 355 356 this.letters.addAll(tmpLetters); 357 this.lines.addAll(tmpLines); 358 for (final LineCandidate line : tmpLines) { 359 this.words.addAll(line.words); 360 } 361 } 362 363 /** 364 * Get the detected candidate lines of text 365 * 366 * @return the lines of text 367 */ 368 public List<LineCandidate> getLines() { 369 return lines; 370 } 371 372 /** 373 * Get the unfiltered detected characters. Normally you would want to get 374 * the lines and then get the characters from each line as these will be 375 * filtered. 376 * 377 * @see #getLines() 378 * 379 * @return the unfiltered characters 380 */ 381 public List<LetterCandidate> getLetters() { 382 return letters; 383 } 384}