001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030/** 031 * 032 */ 033package org.openimaj.image.text.extraction; 034 035import java.util.ArrayList; 036import java.util.HashMap; 037import java.util.Iterator; 038import java.util.List; 039import java.util.Map; 040 041import org.openimaj.citation.annotation.Reference; 042import org.openimaj.citation.annotation.ReferenceType; 043import org.openimaj.image.DisplayUtilities; 044import org.openimaj.image.FImage; 045import org.openimaj.image.MBFImage; 046import org.openimaj.image.connectedcomponent.ConnectedComponentLabeler; 047import org.openimaj.image.pixel.ConnectedComponent; 048import org.openimaj.image.pixel.ConnectedComponent.ConnectMode; 049import org.openimaj.image.pixel.Pixel; 050import org.openimaj.image.pixel.PixelSet; 051import org.openimaj.image.processing.convolution.CompassOperators.Compass0; 052import org.openimaj.image.processing.convolution.CompassOperators.Compass135; 053import org.openimaj.image.processing.convolution.CompassOperators.Compass45; 054import org.openimaj.image.processing.convolution.CompassOperators.Compass90; 055import org.openimaj.image.processing.convolution.FConvolution; 056import org.openimaj.image.processing.morphology.Close; 057import org.openimaj.image.processing.morphology.Dilate; 058import org.openimaj.image.processing.morphology.StructuringElement; 059import org.openimaj.image.processing.morphology.Thin; 060import org.openimaj.image.processing.threshold.OtsuThreshold; 061import org.openimaj.image.processing.transform.SkewCorrector; 062import org.openimaj.image.processor.connectedcomponent.ConnectedComponentProcessor; 063import org.openimaj.image.processor.connectedcomponent.render.OrientatedBoundingBoxRenderer; 064import org.openimaj.image.text.ocr.OCRProcessor; 065import org.openimaj.math.geometry.point.Point2d; 066import org.openimaj.math.geometry.point.Point2dImpl; 067import org.openimaj.math.geometry.shape.Polygon; 068import org.openimaj.math.geometry.shape.Rectangle; 069import org.openimaj.util.pair.IndependentPair; 070 071/** 072 * A processor that attempts to extract text from an image. It uses a 3-stage 073 * process: 1) find possible text regions; 2) filter then extract those regions; 074 * 3) OCR. 075 * <p> 076 * In the first stage it builds a feature map which is an image where the pixel 077 * intensity is the likelihood of a pixel being within a text region. It does 078 * this by a series of convolutions and morphological operations that find 079 * regions that have short edges in multiple directions. 080 * <p> 081 * In the second stage, the regions are turned into blobs and those blobs that 082 * are too small or inappropriately shaped are removed. The regions are then 083 * extracted from the original image as subimages containing text. The extracted 084 * subimages can have an expansion multipler applied to the box to ensure that 085 * enough surrounding information is contained within the extracted subimage for 086 * the OCR to work. Use {@link #setBoundingBoxPaddingPc(float)} with a multipler 087 * to expand the bounding boxes with; i.e. 1.05 will expand the bounding box by 088 * 5%. 089 * <p> 090 * The third stage simply uses an {@link OCRProcessor} to process the subimages 091 * and extract textual strings. Use the {@link #setOCRProcessor(OCRProcessor)} 092 * to set the {@link OCRProcessor} to use to extract text. Note that by default 093 * no processor is set. If the processor is executed without an 094 * {@link OCRProcessor} being set, the OCR stage will not occur. This part of 095 * the implementation has moved into {@link TextExtractor} super class. 096 * <p> 097 * The output of the processor can be retrieved using {@link #getTextRegions()} 098 * which returns a map where the key is a bounding box of every detected text 099 * region and the value is a pair of subimage to extracted text. 100 * <p> 101 * From: [paper 01626635.pdf] Xiaoqing Liu and Jagath Samarabandu; An Edge-based 102 * Text Region Extraction Algorithm for Indoor Mobile Robot Navigation, 103 * Proceedings of the IEEE International Conference on Mechatronics & Automation 104 * Niagara Falls, Canada, July 2005 105 * 106 * @see "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1626635" 107 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 108 * @created 29 Jul 2011 109 * 110 */ 111@Reference( 112 type = ReferenceType.Inproceedings, 113 author = { "Xiaoqing Liu", "Samarabandu, J." }, 114 title = "An edge-based text region extraction algorithm for indoor mobile robot navigation", 115 year = "2005", 116 booktitle = "Mechatronics and Automation, 2005 IEEE International Conference", 117 pages = { " 701 ", " 706 Vol. 2" }, 118 month = "July-1 Aug.", 119 number = "", 120 volume = "2", 121 customData = { 122 "keywords", 123 "edge-based text region extraction; feature extraction; scene text; text localization; vision-based mobile robot navigation; character recognition; edge detection; feature extraction; mobile robots; navigation; path planning; robot vision;", 124 "doi", "10.1109/ICMA.2005.1626635", "ISSN", "" }) 125public class LiuSamarabanduTextExtractorBasic extends TextExtractor<FImage> 126{ 127 /** Whether to debug the text extractor - displaying images as it goes */ 128 public static final boolean DEBUG = false; 129 130 /** Percentage of size to add around the bounding box of a text region */ 131 private float boundingBoxPaddingPc = 1.1f; 132 133 /** The output of the processor. Use #getTextRegions() */ 134 private Map<Rectangle, FImage> textRegions = null; 135 136 /** 137 * Helper method that convolves the given image with the given convolution 138 * operator. The method also performs an abs() and a normalise(). We can 139 * take the absolute value as we're only interested in whether edges are, 140 * for example, vertical and not in what direction they are. 141 * 142 * @param img 143 * The image to convolve 144 * @param c 145 * The convolution operator 146 * @return A convolved image. 147 */ 148 private FImage processImage(FImage img, FConvolution c) 149 { 150 return img.process(c) 151 .abs() 152 .normalise(); 153 } 154 155 /** 156 * {@inheritDoc} 157 * 158 * @see org.openimaj.image.processor.ImageProcessor#processImage(org.openimaj.image.Image) 159 */ 160 @Override 161 public void processImage(FImage image) 162 { 163 // Find which regions might be text 164 final FImage fmap = textRegionDetection(image); 165 166 // Process the feature map 167 processFeatureMap(fmap, image); 168 169 // The output process image is the feature map 170 image.internalAssign(fmap); 171 } 172 173 /** 174 * Process a feature map. This function will side affect the field 175 * <code>textRegions</code> in this class. Use {@link #getTextRegions()} to 176 * retrieve the text regions extracted from this method. 177 * 178 * @param fmap 179 * The feature map to process 180 * @param image 181 * The original image. 182 */ 183 public void processFeatureMap(FImage fmap, FImage image) 184 { 185 // Extract the text regions from the image 186 final Map<Rectangle, FImage> t = textRegionLocalisation(fmap, image); 187 this.textRegions = t; 188 } 189 190 /** 191 * Calculate the feature map that give the approximate localisation of 192 * candidate text regions. 193 * 194 * @param image 195 * The image to process. 196 * @return The feature map 197 */ 198 public FImage textRegionDetection(FImage image) 199 { 200 // 1) Directional Filtering 201 final HashMap<Integer, FImage> e = new HashMap<Integer, FImage>(); 202 e.put(0, processImage(image, new Compass0())); 203 e.put(45, processImage(image, new Compass45())); 204 e.put(90, processImage(image, new Compass90())); 205 e.put(135, processImage(image, new Compass135())); 206 207 // 2) Edge Selection 208 // 2a) Get strong edges 209 final FImage e90strong = e.get(90).process(new OtsuThreshold()); 210 211 if (DEBUG) 212 DisplayUtilities.display(e90strong, "Strong Edges"); 213 214 // 2b) Get weak edges 215 // 2b)i) Dilate: 216 // Use a horizontal 1x3 structuring element 217 final StructuringElement se = new StructuringElement(); 218 se.positive.add(new Pixel(0, 0)); 219 se.positive.add(new Pixel(-1, 0)); 220 se.positive.add(new Pixel(1, 0)); 221 222 if (DEBUG) 223 System.out.println("Dilating with a 1x3 structuring element"); 224 225 final FImage dilated = e90strong.process(new Dilate(se)); 226 227 // 2b)ii) Close: 228 // Use a vertical mx1 structuring element 229 int m = (int) (dilated.getHeight() / 25d); 230 231 if (DEBUG) 232 System.out.println("Closing with a " + m + "x1 structuring element."); 233 234 final StructuringElement se2 = new StructuringElement(); 235 for (int i = 0; i < m; i++) 236 se2.positive.add(new Pixel(0, i - m / 2)); 237 final FImage closed = dilated.process(new Close(se2)); 238 239 // 2b)iii) E90w = |E90 x (closed-dilated)|z 240 // |.|z is the Otsu threshold operator 241 FImage e90weak = closed.subtract(dilated).abs(); 242 e90weak.multiplyInplace(e.get(90)); 243 e90weak = e90weak.process(new OtsuThreshold()); 244 245 if (DEBUG) 246 DisplayUtilities.display(e90weak, "Weak Edges"); 247 248 final FImage e90edges = e90strong.add(e90weak).normalise().process( 249 new OtsuThreshold()); 250 251 if (DEBUG) 252 DisplayUtilities.display(e90edges, "Edges"); 253 254 // 3a) Thin edges 255 final FImage e90thin = e90edges.process(new Thin(StructuringElement.BOX)); 256 257 if (DEBUG) 258 DisplayUtilities.display(e90thin, "Thinned"); 259 260 final ConnectedComponentLabeler ccl = new ConnectedComponentLabeler( 261 ConnectMode.CONNECT_4); 262 final List<ConnectedComponent> cc = ccl.findComponents(e90thin); 263 264 // 4a) Label edges 265 final FImage e90labelled = new FImage(e90thin.getWidth(), e90thin.getHeight()); 266 final ConnectedComponentProcessor ccp = new ConnectedComponentProcessor() 267 { 268 @Override 269 public void process(ConnectedComponent cc) 270 { 271 final int a = cc.calculateArea(); 272 for (final Pixel p : cc.pixels) 273 e90labelled.setPixel((int) p.getX(), (int) p.getY(), (float) a); 274 } 275 }; 276 ConnectedComponent.process(cc, ccp); 277 278 if (DEBUG) { 279 DisplayUtilities.display(e90labelled.clone().normalise(), "Labelled Edges"); 280 System.out.println("Max edge length: " + e90labelled.max()); 281 } 282 283 // Threshold the labelled edges to get only short ones 284 final FImage e90short = e90labelled.clone().clip(0f, 1f).subtract( 285 e90labelled.threshold(e90labelled.max() / 4 * 3)); 286 287 if (DEBUG) 288 DisplayUtilities.display(e90short.clone().normalise(), "Thresholded Lengths"); 289 290 // 5) Feature Map Generation 291 // Suppress false regions and enhance candidate regions 292 // 5a) candidate = Dilation( e90short )[mxm] 293 final StructuringElement se3 = new StructuringElement(); 294 for (int i = 0; i < m; i++) 295 for (int j = 0; j < m; j++) 296 se3.positive.add(new Pixel(i - m / 2, j - m / 2)); 297 final FImage e90candidate = e90short.process(new Dilate(se3)); 298 299 if (DEBUG) 300 DisplayUtilities.display(e90candidate, "Candidate Regions"); 301 302 // 5b) refined = candidate * sum( e0,e45,e90,e135 ); 303 final FImage is = e.get(0).clone(). 304 addInplace(e.get(45)). 305 addInplace(e.get(90)). 306 addInplace(e.get(135)); 307 308 // We normalise the refined so that we can more simply 309 // normalise the pixel values in the feature map generation 310 // by the window size as we know each pixel is (0,1) 311 final FImage refined = e90candidate.multiply(is).normalise(); 312 313 if (DEBUG) 314 DisplayUtilities.display(refined, "Refined"); 315 316 // 5c) fmap(i,j) = N{W(i,j).sum[-c<m<c]( sum[-c<n<c]( refined(i+m,j+n) ) 317 // ) } 318 final int c = 5; // window size -- value not specified in paper 319 final FImage fmap = new FImage(image.getWidth(), image.getHeight()); 320 321 // Unlike the paper, we use the actual values of the edges 322 // in each of the direction images to calculate the weight 323 // for each pixel in the pixel map. So, instead of counting 324 // (which would require thresholding the direction images), 325 // we simply add the values of the maximum edges in each 326 // of the direction images; it means if there are 4 directions 327 // strongly represented in the window the weight will be near 328 // 4 (before normalisation); if there is only 1, the weight will 329 // be near 1. Anyway, we have to store the maximum for each direction 330 // image within the window which is what this hashmap is for, 331 // and what the updateMaxPixDir() private method is used for. 332 final HashMap<Integer, Float> maxPixDir = new HashMap<Integer, Float>(); 333 334 // For every pixel in the feature map 335 for (int j = c; j < image.getHeight() - c; j++) 336 { 337 for (int i = c; i < image.getWidth() - c; i++) 338 { 339 float pixelValue = 0; 340 final float N = c * c; 341 maxPixDir.clear(); 342 343 // Look in the window 344 for (m = -c; m < c; m++) 345 { 346 for (int n = -c; n < c; n++) 347 { 348 pixelValue += refined.getPixel(i + m, j + n); 349 350 updateMaxPixDir(maxPixDir, e, 0, i + m, j + n); 351 updateMaxPixDir(maxPixDir, e, 45, i + m, j + n); 352 updateMaxPixDir(maxPixDir, e, 90, i + m, j + n); 353 updateMaxPixDir(maxPixDir, e, 135, i + m, j + n); 354 } 355 } 356 357 float w = maxPixDir.get(0) + 358 maxPixDir.get(45) + 359 maxPixDir.get(90) + 360 maxPixDir.get(135); 361 w /= 4; // normalise the weight so it's between 0 and 1 362 363 pixelValue *= w; 364 pixelValue /= N; 365 366 fmap.setPixel(i, j, pixelValue); 367 } 368 } 369 370 if (DEBUG) 371 DisplayUtilities.display(fmap.clone().normalise(), "Feature Map"); 372 373 return fmap; 374 } 375 376 /** 377 * Helper method to store the maximum pixel value for a given direction at a 378 * given x and y coordinate. 379 * 380 * @param maxPixDir 381 * The hashmap in which the maxes are stored 382 * @param e 383 * The hashmap of direction images 384 * @param dir 385 * The direction to dereference 386 * @param x 387 * the x-coord 388 * @param y 389 * the y-coord 390 */ 391 private void updateMaxPixDir(HashMap<Integer, Float> maxPixDir, HashMap<Integer, FImage> e, int dir, int x, int y) 392 { 393 Float xx = null; 394 if ((xx = maxPixDir.get(dir)) == null) 395 maxPixDir.put(dir, e.get(dir).getPixel(x, y)); 396 else 397 maxPixDir.put(dir, Math.max(xx, e.get(dir).getPixel(x, y))); 398 } 399 400 /** 401 * Extract the regions that probably contain text (as given by the feature 402 * map) 403 * 404 * @param fmap 405 * The feature map calculated from 406 * {@link #textRegionDetection(FImage)} 407 * @param image 408 * The original image 409 * @return A map of boundingbox->images that area localised text regions 410 */ 411 public Map<Rectangle, FImage> textRegionLocalisation(FImage fmap, FImage image) 412 { 413 // We'll store the localised text regions in this list 414 final HashMap<Rectangle, FImage> textAreas = new HashMap<Rectangle, FImage>(); 415 416 // Threshold the image to find high probability regions 417 final FImage thresh = fmap.clone().normalise().process(new OtsuThreshold()); 418 419 // Dilate with a 7x7 structuring element 420 final StructuringElement se = new StructuringElement(); 421 final int ses = 9; 422 for (int i = 0; i < ses; i++) 423 for (int j = 0; j < ses; j++) 424 se.positive.add(new Pixel(i, j)); 425 426 final FImage dilated = thresh.process(new Dilate(se)); 427 428 if (DEBUG) 429 DisplayUtilities.display(dilated, "Candidate text-blobs"); 430 431 // We use the connected-component labeller to extract the components. 432 final ConnectedComponentLabeler ccl = new 433 ConnectedComponentLabeler(ConnectMode.CONNECT_4); 434 final List<ConnectedComponent> ccs = ccl.findComponents(dilated); 435 436 System.out.println("Got " + ccs.size() + " connected components."); 437 438 // Now we can filter by iterating over the components 439 // Heuristics: 440 // Area(region) >= (1/20).max 441 int maxArea = 0; 442 for (final PixelSet cc : ccs) 443 maxArea = Math.max(maxArea, cc.calculateArea()); 444 445 // - Remove regions that are too small 446 for (final Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext();) 447 if (cci.next().calculateArea() < maxArea / 20d) 448 cci.remove(); 449 450 // Ratio(w/h) = w/h >= 0.2 451 // - Remove regions that aren't square enough. 452 for (final Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext();) 453 { 454 final PixelSet cc = cci.next(); 455 final Rectangle r = cc.calculateRegularBoundingBox(); 456 if (r.width / r.height < 0.2) 457 cci.remove(); 458 } 459 460 if (DEBUG) { 461 final MBFImage bb = new MBFImage(image.getWidth(), image.getHeight(), 3); 462 bb.createRenderer().drawImage(image, 0, 0); 463 final OrientatedBoundingBoxRenderer<Float[]> obbr = new 464 OrientatedBoundingBoxRenderer<Float[]>(bb, new Float[] { 1f, 1f, 0f }); 465 ConnectedComponent.process(ccs, obbr); 466 DisplayUtilities.display(bb); 467 System.out.println("Continuing with " + ccs.size() + " connected components."); 468 } 469 470 // Extract the text regions from the original image 471 for (final PixelSet cc : ccs) 472 { 473 if (cc.getPixels().size() < 20) 474 continue; 475 476 // Extract from the image the text region 477 final Rectangle r = cc.calculateRegularBoundingBox(); 478 r.scaleCentroid(boundingBoxPaddingPc); 479 FImage textArea = image.extractROI(r); 480 481 // Threshold of the image make it easier to extract MSERs 482 final OtsuThreshold o = new OtsuThreshold(); 483 o.processImage(textArea); 484 485 if (DEBUG) 486 DisplayUtilities.display(textArea, "text area - before distortion"); 487 488 // // We distort the image to attempt to make the lettering 489 // straighter 490 // // and more readable for the OCR 491 // // We work out the bounding box of the text to distort 492 // Polygon p = cc.calculateOrientatedBoundingBox(); 493 // 494 // // and the mapping to distort the text to a regular rectangle 495 // List<IndependentPair<Point2d,Point2d>> pointPairs = 496 // calculateHomography( p ); 497 // 498 // // Calculate the homography matrix to distort the image. 499 // Matrix homoMatrix = TransformUtilities.homographyMatrix( 500 // pointPairs ); 501 // 502 // // Transform the image 503 // textArea = textArea.transform( homoMatrix ); 504 505 final SkewCorrector sc = new SkewCorrector(); 506 sc.setAccuracy(4); 507 textArea = textArea.process(sc); 508 509 // Store the detected text and the distorted image 510 textAreas.put(r, textArea); 511 512 if (DEBUG) 513 DisplayUtilities.display(textArea, "text area - after distortion"); 514 } 515 516 return textAreas; 517 } 518 519 /** 520 * Calculates the point pairing for a given distorted polygon into 521 * orthogonal space. 522 * 523 * @param p 524 * The polygon with 4 points 525 * @return A list of point pairs 526 */ 527 public List<IndependentPair<Point2d, Point2d>> calculateHomography(Polygon p) 528 { 529 // Our output list 530 final List<IndependentPair<Point2d, Point2d>> pointPairs = new 531 ArrayList<IndependentPair<Point2d, Point2d>>(); 532 533 // Get the vertices 534 // 535 // p1----------p4 536 // | | 537 // p2----------p3 538 // 539 final List<Point2d> v = p.getVertices(); 540 final Point2d p1 = v.get(0); 541 final Point2d p2 = v.get(1); 542 final Point2d p3 = v.get(2); 543 final Point2d p4 = v.get(3); 544 545 // Mapped vertices 546 final Point2d p1p = new Point2dImpl(p2.getX(), p1.getY()); // vertical 547 // above p2 548 final Point2d p2p = v.get(1); // don't move 549 final Point2d p3p = new Point2dImpl(p3.getX(), p2.getY()); // horizontal 550 // from p2 551 final Point2d p4p = new Point2dImpl(p3p.getX(), p1.getY()); // vertical 552 // above p3 553 554 pointPairs.add(new IndependentPair<Point2d, Point2d>(p1, p1p)); 555 pointPairs.add(new IndependentPair<Point2d, Point2d>(p2, p2p)); 556 pointPairs.add(new IndependentPair<Point2d, Point2d>(p3, p3p)); 557 pointPairs.add(new IndependentPair<Point2d, Point2d>(p4, p4p)); 558 559 return pointPairs; 560 } 561 562 /** 563 * Get the expansion value of the bounding boxes that are generated for the 564 * text regions. 565 * 566 * @return the new bounding box expansion multiplier. 567 */ 568 public float getBoundingBoxPaddingPc() 569 { 570 return boundingBoxPaddingPc; 571 } 572 573 /** 574 * Set the expansion value for the subimage extraction. 575 * 576 * @param boundingBoxPaddingPc 577 * the new multiplier 578 */ 579 public void setBoundingBoxPaddingPc(float boundingBoxPaddingPc) 580 { 581 this.boundingBoxPaddingPc = boundingBoxPaddingPc; 582 } 583 584 /** 585 * Returns a map of bounding box to image and textual string. 586 * 587 * @return A map of image bounding box to subimage and text string. 588 */ 589 @Override 590 public Map<Rectangle, FImage> getTextRegions() 591 { 592 return this.textRegions; 593 } 594}