/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   *  Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   *  Redistributions in binary form must reproduce the above copyright notice,
 *      this list of conditions and the following disclaimer in the documentation
 *      and/or other materials provided with the distribution.
 *
 *   *  Neither the name of the University of Southampton nor the names of its
 *      contributors may be used to endorse or promote products derived from this
 *      software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.openimaj.image.text.extraction;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.openimaj.citation.annotation.Reference;
import org.openimaj.citation.annotation.ReferenceType;
import org.openimaj.image.DisplayUtilities;
import org.openimaj.image.FImage;
import org.openimaj.image.MBFImage;
import org.openimaj.image.connectedcomponent.ConnectedComponentLabeler;
import org.openimaj.image.pixel.ConnectedComponent;
import org.openimaj.image.pixel.ConnectedComponent.ConnectMode;
import org.openimaj.image.pixel.Pixel;
import org.openimaj.image.pixel.PixelSet;
import org.openimaj.image.processing.convolution.CompassOperators.Compass0;
import org.openimaj.image.processing.convolution.CompassOperators.Compass135;
import org.openimaj.image.processing.convolution.CompassOperators.Compass45;
import org.openimaj.image.processing.convolution.CompassOperators.Compass90;
import org.openimaj.image.processing.convolution.FConvolution;
import org.openimaj.image.processing.morphology.Close;
import org.openimaj.image.processing.morphology.Dilate;
import org.openimaj.image.processing.morphology.StructuringElement;
import org.openimaj.image.processing.morphology.Thin;
import org.openimaj.image.processing.threshold.OtsuThreshold;
import org.openimaj.image.processing.transform.SkewCorrector;
import org.openimaj.image.processor.connectedcomponent.ConnectedComponentProcessor;
import org.openimaj.image.processor.connectedcomponent.render.OrientatedBoundingBoxRenderer;
import org.openimaj.image.text.ocr.OCRProcessor;
import org.openimaj.math.geometry.point.Point2d;
import org.openimaj.math.geometry.point.Point2dImpl;
import org.openimaj.math.geometry.shape.Polygon;
import org.openimaj.math.geometry.shape.Rectangle;
import org.openimaj.util.pair.IndependentPair;

/**
 * A processor that attempts to extract text from an image. It uses a 3-stage
 * process: 1) find possible text regions; 2) filter then extract those regions;
 * 3) OCR.
 * <p>
 * In the first stage it builds a feature map, which is an image where the pixel
 * intensity is the likelihood of a pixel being within a text region. It does
 * this by a series of convolutions and morphological operations that find
 * regions that have short edges in multiple directions.
 * <p>
 * In the second stage, the regions are turned into blobs, and those blobs that
 * are too small or inappropriately shaped are removed. The regions are then
 * extracted from the original image as subimages containing text. The extracted
 * subimages can have an expansion multiplier applied to the box to ensure that
 * enough surrounding information is contained within the extracted subimage for
 * the OCR to work. Use {@link #setBoundingBoxPaddingPc(float)} with a multiplier
 * to expand the bounding boxes; e.g. 1.05 will expand the bounding box by 5%.
 * <p>
 * The third stage simply uses an {@link OCRProcessor} to process the subimages
 * and extract textual strings. Use {@link #setOCRProcessor(OCRProcessor)} to
 * set the {@link OCRProcessor} to use to extract text. Note that by default no
 * processor is set. If the processor is executed without an
 * {@link OCRProcessor} being set, the OCR stage will not occur. This part of
 * the implementation has moved into the {@link TextExtractor} superclass.
 * <p>
 * The output of the processor can be retrieved using {@link #getTextRegions()},
 * which returns a map where each key is the bounding box of a detected text
 * region and the corresponding value is the extracted subimage.
 * <p>
 * From: Xiaoqing Liu and Jagath Samarabandu; An Edge-based Text Region
 * Extraction Algorithm for Indoor Mobile Robot Navigation, Proceedings of the
 * IEEE International Conference on Mechatronics and Automation, Niagara Falls,
 * Canada, July 2005.
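 * <p>
 * A minimal usage sketch (image loading is assumed to happen elsewhere; note
 * that {@link #processImage(FImage)} replaces its argument with the feature
 * map, so a clone is passed here):
 *
 * <pre>
 * {@code
 * LiuSamarabanduTextExtractorBasic extractor = new LiuSamarabanduTextExtractorBasic();
 * extractor.processImage(image.clone());
 * Map<Rectangle, FImage> regions = extractor.getTextRegions();
 * }
 * </pre>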
 *
 * @see "http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1626635"
 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
 * @created 29 Jul 2011
 *
 */
@Reference(
        type = ReferenceType.Inproceedings,
        author = { "Xiaoqing Liu", "Samarabandu, J." },
        title = "An edge-based text region extraction algorithm for indoor mobile robot navigation",
        year = "2005",
        booktitle = "Mechatronics and Automation, 2005 IEEE International Conference",
        pages = { "701", "706 Vol. 2" },
        month = "July-1 Aug.",
        number = "",
        volume = "2",
        customData = {
                "keywords",
                "edge-based text region extraction; feature extraction; scene text; text localization; vision-based mobile robot navigation; character recognition; edge detection; feature extraction; mobile robots; navigation; path planning; robot vision;",
                "doi", "10.1109/ICMA.2005.1626635", "ISSN", "" })
public class LiuSamarabanduTextExtractorBasic extends TextExtractor<FImage>
{
    /** Whether to debug the text extractor - displaying images as it goes */
    public static final boolean DEBUG = false;

    /** Multiplier used to pad the bounding box of a text region (1.1 = 10% padding) */
    private float boundingBoxPaddingPc = 1.1f;

    /** The output of the processor. Use {@link #getTextRegions()}. */
    private Map<Rectangle, FImage> textRegions = null;

    /**
     * Helper method that convolves the given image with the given convolution
     * operator. The method also performs an abs() and a normalise(). We can
     * take the absolute value as we are only interested in whether an edge is,
     * for example, vertical, and not in its polarity.
     *
     * @param img
     *            The image to convolve
     * @param c
     *            The convolution operator
     * @return A convolved image.
     */
    private FImage processImage(FImage img, FConvolution c)
    {
        return img.process(c)
                .abs()
                .normalise();
    }

    /**
     * {@inheritDoc}
     *
     * @see org.openimaj.image.processor.ImageProcessor#processImage(org.openimaj.image.Image)
     */
    @Override
    public void processImage(FImage image)
    {
        // Find which regions might be text
        final FImage fmap = textRegionDetection(image);

        // Process the feature map
        processFeatureMap(fmap, image);

        // The output of this processor is the feature map
        image.internalAssign(fmap);
    }

    /**
     * Process a feature map. This method side-effects the field
     * <code>textRegions</code> in this class. Use {@link #getTextRegions()} to
     * retrieve the text regions extracted by this method.
     *
     * @param fmap
     *            The feature map to process
     * @param image
     *            The original image.
     */
    public void processFeatureMap(FImage fmap, FImage image)
    {
        // Extract the text regions from the image
        final Map<Rectangle, FImage> t = textRegionLocalisation(fmap, image);
        this.textRegions = t;
    }

    /**
     * Calculate the feature map that gives the approximate localisation of
     * candidate text regions.
     *
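     * A sketch of how the returned feature map might be inspected, continuing
     * the usage example in the class documentation (the display call mirrors
     * what the DEBUG flag does internally):
     *
     * <pre>
     * {@code
     * FImage fmap = extractor.textRegionDetection(image);
     * DisplayUtilities.display(fmap.clone().normalise(), "Feature Map");
     * }
     * </pre>
     *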
     * @param image
     *            The image to process.
     * @return The feature map
     */
    public FImage textRegionDetection(FImage image)
    {
        // 1) Directional Filtering
        final HashMap<Integer, FImage> e = new HashMap<Integer, FImage>();
        e.put(0, processImage(image, new Compass0()));
        e.put(45, processImage(image, new Compass45()));
        e.put(90, processImage(image, new Compass90()));
        e.put(135, processImage(image, new Compass135()));

        // 2) Edge Selection
        // 2a) Get strong edges
        final FImage e90strong = e.get(90).process(new OtsuThreshold());

        if (DEBUG)
            DisplayUtilities.display(e90strong, "Strong Edges");

        // 2b) Get weak edges
        // 2b)i) Dilate:
        // Use a horizontal 1x3 structuring element
        final StructuringElement se = new StructuringElement();
        se.positive.add(new Pixel(0, 0));
        se.positive.add(new Pixel(-1, 0));
        se.positive.add(new Pixel(1, 0));

        if (DEBUG)
            System.out.println("Dilating with a 1x3 structuring element");

        final FImage dilated = e90strong.process(new Dilate(se));

        // 2b)ii) Close:
        // Use a vertical mx1 structuring element
        int m = (int) (dilated.getHeight() / 25d);

        if (DEBUG)
            System.out.println("Closing with a " + m + "x1 structuring element.");

        final StructuringElement se2 = new StructuringElement();
        for (int i = 0; i < m; i++)
            se2.positive.add(new Pixel(0, i - m / 2));
        final FImage closed = dilated.process(new Close(se2));

        // 2b)iii) E90w = |E90 x (closed - dilated)|z
        // where |.|z is the Otsu threshold operator
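        // (closed - dilated) highlights the gaps that the closing operation
        // filled in; masking this with the raw 90-degree edge response keeps
        // only the genuine weak edges within those gaps.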
        FImage e90weak = closed.subtract(dilated).abs();
        e90weak.multiplyInplace(e.get(90));
        e90weak = e90weak.process(new OtsuThreshold());

        if (DEBUG)
            DisplayUtilities.display(e90weak, "Weak Edges");

        final FImage e90edges = e90strong.add(e90weak).normalise().process(
                new OtsuThreshold());

        if (DEBUG)
            DisplayUtilities.display(e90edges, "Edges");

        // 3a) Thin edges
        final FImage e90thin = e90edges.process(new Thin(StructuringElement.BOX));

        if (DEBUG)
            DisplayUtilities.display(e90thin, "Thinned");

        final ConnectedComponentLabeler ccl = new ConnectedComponentLabeler(
                ConnectMode.CONNECT_4);
        final List<ConnectedComponent> cc = ccl.findComponents(e90thin);

        // 4a) Label edges with their length (the area of their connected component)
        final FImage e90labelled = new FImage(e90thin.getWidth(), e90thin.getHeight());
        final ConnectedComponentProcessor ccp = new ConnectedComponentProcessor()
        {
            @Override
            public void process(ConnectedComponent cc)
            {
                final int a = cc.calculateArea();
                for (final Pixel p : cc.pixels)
                    e90labelled.setPixel((int) p.getX(), (int) p.getY(), (float) a);
            }
        };
        ConnectedComponent.process(cc, ccp);

        if (DEBUG) {
            DisplayUtilities.display(e90labelled.clone().normalise(), "Labelled Edges");
            System.out.println("Max edge length: " + e90labelled.max());
        }

        // Threshold the labelled edges to keep only the short ones
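        // clip(0, 1) turns the labelled image into a mask of all edge pixels;
        // subtracting a mask of the components whose length exceeds 3/4 of the
        // maximum then removes the long edges, leaving the short edges that
        // are characteristic of text.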
        final FImage e90short = e90labelled.clone().clip(0f, 1f).subtract(
                e90labelled.threshold(e90labelled.max() / 4 * 3));

        if (DEBUG)
            DisplayUtilities.display(e90short.clone().normalise(), "Thresholded Lengths");

        // 5) Feature Map Generation
        // Suppress false regions and enhance candidate regions
        // 5a) candidate = Dilation( e90short )[mxm]
        final StructuringElement se3 = new StructuringElement();
        for (int i = 0; i < m; i++)
            for (int j = 0; j < m; j++)
                se3.positive.add(new Pixel(i - m / 2, j - m / 2));
        final FImage e90candidate = e90short.process(new Dilate(se3));

        if (DEBUG)
            DisplayUtilities.display(e90candidate, "Candidate Regions");

        // 5b) refined = candidate * sum( e0, e45, e90, e135 )
        final FImage is = e.get(0).clone().
                addInplace(e.get(45)).
                addInplace(e.get(90)).
                addInplace(e.get(135));

        // We normalise the refined map so that, in the feature map generation,
        // we can simply normalise the pixel values by the window size, as we
        // know each pixel is in (0, 1)
        final FImage refined = e90candidate.multiply(is).normalise();

        if (DEBUG)
            DisplayUtilities.display(refined, "Refined");

        // 5c) fmap(i,j) = N{ W(i,j) . sum[-c<m<c]( sum[-c<n<c]( refined(i+m, j+n) ) ) }
        final int c = 5; // window size -- value not specified in the paper
        final FImage fmap = new FImage(image.getWidth(), image.getHeight());

        // Unlike the paper, we use the actual values of the edges in each of
        // the direction images to calculate the weight for each pixel in the
        // feature map. Instead of counting directions (which would require
        // thresholding the direction images), we add the values of the maximum
        // edges in each of the direction images: if all 4 directions are
        // strongly represented in the window, the weight will be near 4
        // (before normalisation); if only 1 is, the weight will be near 1.
        // We store the maximum for each direction image within the window in
        // this hashmap, which is updated by the updateMaxPixDir() private
        // method.
        final HashMap<Integer, Float> maxPixDir = new HashMap<Integer, Float>();

        // For every pixel in the feature map
        for (int j = c; j < image.getHeight() - c; j++)
        {
            for (int i = c; i < image.getWidth() - c; i++)
            {
                float pixelValue = 0;
                final float N = c * c;
                maxPixDir.clear();

                // Look in the window (fresh loop variables so we don't clobber
                // the structuring-element size m declared above)
                for (int wi = -c; wi < c; wi++)
                {
                    for (int wj = -c; wj < c; wj++)
                    {
                        pixelValue += refined.getPixel(i + wi, j + wj);

                        updateMaxPixDir(maxPixDir, e, 0, i + wi, j + wj);
                        updateMaxPixDir(maxPixDir, e, 45, i + wi, j + wj);
                        updateMaxPixDir(maxPixDir, e, 90, i + wi, j + wj);
                        updateMaxPixDir(maxPixDir, e, 135, i + wi, j + wj);
                    }
                }

                float w = maxPixDir.get(0) +
                        maxPixDir.get(45) +
                        maxPixDir.get(90) +
                        maxPixDir.get(135);
                w /= 4; // normalise the weight so it's between 0 and 1

                pixelValue *= w;
                pixelValue /= N;

                fmap.setPixel(i, j, pixelValue);
            }
        }

        if (DEBUG)
            DisplayUtilities.display(fmap.clone().normalise(), "Feature Map");

        return fmap;
    }

    /**
     * Helper method to store the maximum pixel value for a given direction at
     * a given x and y coordinate.
     *
     * @param maxPixDir
     *            The hashmap in which the maxima are stored
     * @param e
     *            The hashmap of direction images
     * @param dir
     *            The direction to dereference
     * @param x
     *            the x-coordinate
     * @param y
     *            the y-coordinate
     */
    private void updateMaxPixDir(HashMap<Integer, Float> maxPixDir, HashMap<Integer, FImage> e, int dir, int x, int y)
    {
        final Float xx = maxPixDir.get(dir);
        if (xx == null)
            maxPixDir.put(dir, e.get(dir).getPixel(x, y));
        else
            maxPixDir.put(dir, Math.max(xx, e.get(dir).getPixel(x, y)));
    }

    /**
     * Extract the regions that probably contain text (as given by the feature
     * map).
     *
     * @param fmap
     *            The feature map calculated from
     *            {@link #textRegionDetection(FImage)}
     * @param image
     *            The original image
     * @return A map of bounding box to subimage for the localised text regions
     */
    public Map<Rectangle, FImage> textRegionLocalisation(FImage fmap, FImage image)
    {
        // We'll store the localised text regions in this map
        final HashMap<Rectangle, FImage> textAreas = new HashMap<Rectangle, FImage>();

        // Threshold the image to find high probability regions
        final FImage thresh = fmap.clone().normalise().process(new OtsuThreshold());

        // Dilate with a 9x9 structuring element
        final StructuringElement se = new StructuringElement();
        final int ses = 9;
        for (int i = 0; i < ses; i++)
            for (int j = 0; j < ses; j++)
                se.positive.add(new Pixel(i, j));

        final FImage dilated = thresh.process(new Dilate(se));

        if (DEBUG)
            DisplayUtilities.display(dilated, "Candidate text-blobs");

        // We use the connected-component labeller to extract the components.
        final ConnectedComponentLabeler ccl = new
                ConnectedComponentLabeler(ConnectMode.CONNECT_4);
        final List<ConnectedComponent> ccs = ccl.findComponents(dilated);

        if (DEBUG)
            System.out.println("Got " + ccs.size() + " connected components.");

        // Now we can filter by iterating over the components
        // Heuristic: Area(region) >= (1/20) * max area
        int maxArea = 0;
        for (final PixelSet cc : ccs)
            maxArea = Math.max(maxArea, cc.calculateArea());

        // - Remove regions that are too small
        for (final Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext();)
            if (cci.next().calculateArea() < maxArea / 20d)
                cci.remove();

        // Heuristic: Ratio(w/h) >= 0.2
        // - Remove regions that aren't square enough.
        for (final Iterator<ConnectedComponent> cci = ccs.iterator(); cci.hasNext();)
        {
            final PixelSet cc = cci.next();
            final Rectangle r = cc.calculateRegularBoundingBox();
            if (r.width / r.height < 0.2)
                cci.remove();
        }

        if (DEBUG) {
            final MBFImage bb = new MBFImage(image.getWidth(), image.getHeight(), 3);
            bb.createRenderer().drawImage(image, 0, 0);
            final OrientatedBoundingBoxRenderer<Float[]> obbr = new
                    OrientatedBoundingBoxRenderer<Float[]>(bb, new Float[] { 1f, 1f, 0f });
            ConnectedComponent.process(ccs, obbr);
            DisplayUtilities.display(bb);
            System.out.println("Continuing with " + ccs.size() + " connected components.");
        }

        // Extract the text regions from the original image
        for (final PixelSet cc : ccs)
        {
            if (cc.getPixels().size() < 20)
                continue;

            // Extract the text region from the image
            final Rectangle r = cc.calculateRegularBoundingBox();
            r.scaleCentroid(boundingBoxPaddingPc);
            FImage textArea = image.extractROI(r);

            // Thresholding the image makes it easier to extract MSERs
            final OtsuThreshold o = new OtsuThreshold();
            o.processImage(textArea);

            if (DEBUG)
                DisplayUtilities.display(textArea, "text area - before distortion");

            // We could distort the image to attempt to make the lettering
            // straighter and more readable for the OCR:
            //
            // // Work out the bounding box of the text to distort
            // Polygon p = cc.calculateOrientatedBoundingBox();
            //
            // // ... and the mapping to distort the text to a regular rectangle
            // List<IndependentPair<Point2d, Point2d>> pointPairs = calculateHomography(p);
            //
            // // Calculate the homography matrix to distort the image
            // Matrix homoMatrix = TransformUtilities.homographyMatrix(pointPairs);
            //
            // // Transform the image
            // textArea = textArea.transform(homoMatrix);

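            // Instead of the homography-based distortion above, a
            // SkewCorrector is used to straighten the lines of text before
            // any OCR takes place.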
            final SkewCorrector sc = new SkewCorrector();
            sc.setAccuracy(4);
            textArea = textArea.process(sc);

            // Store the bounding box and the deskewed subimage
            textAreas.put(r, textArea);

            if (DEBUG)
                DisplayUtilities.display(textArea, "text area - after distortion");
        }

        return textAreas;
    }

    /**
     * Calculates the point pairs that map a given distorted polygon into
     * orthogonal space.
     *
     * @param p
     *            The polygon with 4 points
     * @return A list of point pairs
     */
    public List<IndependentPair<Point2d, Point2d>> calculateHomography(Polygon p)
    {
        // Our output list
        final List<IndependentPair<Point2d, Point2d>> pointPairs = new
                ArrayList<IndependentPair<Point2d, Point2d>>();

        // Get the vertices:
        //
        //   p1----------p4
        //   |            |
        //   p2----------p3
        //
        final List<Point2d> v = p.getVertices();
        final Point2d p1 = v.get(0);
        final Point2d p2 = v.get(1);
        final Point2d p3 = v.get(2);
        final Point2d p4 = v.get(3);

        // Mapped vertices
        final Point2d p1p = new Point2dImpl(p2.getX(), p1.getY()); // vertically above p2
        final Point2d p2p = v.get(1); // doesn't move
        final Point2d p3p = new Point2dImpl(p3.getX(), p2.getY()); // horizontally in line with p2
        final Point2d p4p = new Point2dImpl(p3p.getX(), p1.getY()); // vertically above p3

        pointPairs.add(new IndependentPair<Point2d, Point2d>(p1, p1p));
        pointPairs.add(new IndependentPair<Point2d, Point2d>(p2, p2p));
        pointPairs.add(new IndependentPair<Point2d, Point2d>(p3, p3p));
        pointPairs.add(new IndependentPair<Point2d, Point2d>(p4, p4p));

        return pointPairs;
    }

    /**
     * Get the expansion value of the bounding boxes that are generated for the
     * text regions.
     *
     * @return the bounding box expansion multiplier.
     */
    public float getBoundingBoxPaddingPc()
    {
        return boundingBoxPaddingPc;
    }

    /**
     * Set the expansion multiplier used when extracting text-region subimages.
     *
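     * For example, {@code setBoundingBoxPaddingPc(1.05f)} expands each
     * bounding box by 5% before the subimage is extracted.
     *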
     * @param boundingBoxPaddingPc
     *            the new multiplier
     */
    public void setBoundingBoxPaddingPc(float boundingBoxPaddingPc)
    {
        this.boundingBoxPaddingPc = boundingBoxPaddingPc;
    }

    /**
     * Returns a map from the bounding box of each detected text region to the
     * extracted (deskewed) subimage.
     *
     * @return A map of bounding box to subimage.
     */
    @Override
    public Map<Rectangle, FImage> getTextRegions()
    {
        return this.textRegions;
    }
}