001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.patterns;
031
032import java.util.ArrayList;
033import java.util.HashSet;
034import java.util.List;
035
036import org.apache.commons.lang.StringUtils;
037
038/**
039 * Match edge punctuations and correct them such that they can be matched by a
040 * simple space split
041 * 
042 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
043 * 
044 */
045public abstract class EdgePunctuationPatternProvider extends PatternProvider {
046
047        protected String EdgePunct;
048        protected String NotEdgePunct;
049        protected String StartPunc;
050        protected String EndPunc;
051
052        /**
053         * @param punctuation
054         */
055        public EdgePunctuationPatternProvider(PunctuationPatternProvider punctuation) {
056                final String[] EdgePunctArr = new String[] { "'", "\"", "\\*", "\u201c", "\u201d", "\u2018", "\u2019", "\\<",
057                                "\\>", "\u00AB", "\u00BB", "{", "}", "\\(", "\\)", "\\[", "\\]", "\\\\", "\\|" };
058                final HashSet<String> edgeSet = new HashSet<String>();
059                for (final String string : EdgePunctArr) {
060                        edgeSet.add(string);
061                }
062                EdgePunct = "[" + StringUtils.join(EdgePunctArr, "") + "]";
063                final List<String> puncArr = new ArrayList<String>();
064                for (final String punc : punctuation.PunctCharsList) {
065                        if (edgeSet.contains(punc))
066                                continue;
067                        puncArr.add(punc);
068                }
069
070                NotEdgePunct = "(?:[a-zA-Z0-9]|" + "[" + StringUtils.join(puncArr, "") + "\\-]" + ")";
071                // NotEdgePunct = "(?:[a-zA-Z0-9])";
072                StartPunc = "\\s|^|[.,]|" + "[a-zA-Z0-9]";
073                EndPunc = "\\s|$|[.,]|" + "[a-zA-Z0-9]";
074        }
075
076        /**
077         * @return the edge punctuation of the default
078         *         {@link EdgePunctuationPatternProvider}
079         */
080        public static String edgePuncPattern() {
081                return new EdgePunctuationPatternProvider(new PunctuationPatternProvider()) {
082                        @Override
083                        public String correctEdges(String s) {
084                                return null;
085                        }
086
087                        @Override
088                        public String patternString() {
089                                return null;
090                        }
091
092                }.EdgePunct;
093        }
094
095        /**
096         * @param s
097         * @return given a string, match the edge punctuation and deal with it
098         *         somehow
099         */
100        public abstract String correctEdges(String s);
101
102        /**
103         * Left edge punctuations. Construct the edge pattern with
104         * (StartPunc)(EdgePunct+)(NotEdgePunct) and replaces with: $1$2 $3.
105         * 
106         * This solves this problem: "Here is (bracketed string)" is replaced with:
107         * "Here is ( bracketed string)"
108         * 
109         * by this class
110         * 
111         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
112         * 
113         */
114        public static class Left extends EdgePunctuationPatternProvider {
115                protected String EdgePunctLeft;
116
117                /**
118                 * @see EdgePunctuationPatternProvider#EdgePunctuationPatternProvider(PunctuationPatternProvider)
119                 * @param punctuation
120                 */
121                public Left(PunctuationPatternProvider punctuation) {
122                        super(punctuation);
123                        EdgePunctLeft = String.format("(%s)(%s+)(%s)", StartPunc, EdgePunct, NotEdgePunct);
124                }
125
126                @Override
127                public String patternString() {
128                        return EdgePunctLeft;
129                }
130
131                @Override
132                public String correctEdges(String s) {
133                        // Matcher matcher = pattern().matcher(s);
134                        // while(matcher.find()){
135                        // System.out.println("Found RIGHT match: '" +
136                        // s.substring(matcher.start(),matcher.end()) + "'");
137                        // System.out.println("... ngroups: " + matcher.groupCount());
138                        // for(int i = 0; i < matcher.groupCount(); i++){
139                        // System.out.println("... ... '" + matcher.group(i) + "'");
140                        // }
141                        // }
142                        return pattern().matcher(s).replaceAll("$1$2 $3");
143                }
144
145        }
146
147        /**
148         * Left edge punctuations. Construct the edge pattern with
149         * (StartPunc)(EdgePunct+)(NotEdgePunct) and replaces with: $1 $2$3.
150         * 
151         * This solves this problem: "Here is (bracketed string)" is replaced with:
152         * "Here is (bracketed string )"
153         * 
154         * by this class
155         * 
156         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
157         * 
158         */
159        public static class Right extends EdgePunctuationPatternProvider {
160                protected String EdgePunctRight;
161
162                /**
163                 * @param punctuation
164                 *            currently unused
165                 */
166                public Right(PunctuationPatternProvider punctuation) {
167                        super(punctuation);
168                        EdgePunctRight = String.format("(%s)(%s+)(%s)", NotEdgePunct, EdgePunct, EndPunc);
169                        // System.out.println("Right match pattern: " + EdgePunctRight);
170                }
171
172                @Override
173                public String patternString() {
174                        return EdgePunctRight;
175                }
176
177                @Override
178                public String correctEdges(String s) {
179                        //
180                        // Matcher matcher = pattern().matcher(s);
181                        // while(matcher.find()){
182                        // System.out.println("Found RIGHT match: '" +
183                        // s.substring(matcher.start(),matcher.end()) + "'");
184                        // System.out.println("... ngroups: " + matcher.groupCount());
185                        // for(int i = 0; i < matcher.groupCount(); i++){
186                        // System.out.println("... ... '" + matcher.group(i) + "'");
187                        // }
188                        // }
189                        final String ret = pattern().matcher(s).replaceAll("$1 $2$3");
190                        return ret;
191                }
192
193        }
194
195        static PunctuationPatternProvider punctuation = new PunctuationPatternProvider();
196        static EdgePunctuationPatternProvider edgeleft = new EdgePunctuationPatternProvider.Left(punctuation);
197        static EdgePunctuationPatternProvider edgeright = new EdgePunctuationPatternProvider.Right(punctuation);
198
199        /**
200         * pads start/end brackets with a space so they can be correctly matched
201         * while not screwing up the rest of the text
202         * 
203         * @param text
204         * @return the corrected text
205         */
206        public static String fixedges(String text) {
207                String s = text;
208                s = edgeleft.correctEdges(s);
209                s = edgeright.correctEdges(s);
210                ;
211                return s;
212        }
213
214}