001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.patterns; 031 032import java.util.ArrayList; 033import java.util.HashSet; 034import java.util.List; 035 036import org.apache.commons.lang.StringUtils; 037 038/** 039 * Match edge punctuations and correct them such that they can be matched by a 040 * simple space split 041 * 042 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 043 * 044 */ 045public abstract class EdgePunctuationPatternProvider extends PatternProvider { 046 047 protected String EdgePunct; 048 protected String NotEdgePunct; 049 protected String StartPunc; 050 protected String EndPunc; 051 052 /** 053 * @param punctuation 054 */ 055 public EdgePunctuationPatternProvider(PunctuationPatternProvider punctuation) { 056 final String[] EdgePunctArr = new String[] { "'", "\"", "\\*", "\u201c", "\u201d", "\u2018", "\u2019", "\\<", 057 "\\>", "\u00AB", "\u00BB", "{", "}", "\\(", "\\)", "\\[", "\\]", "\\\\", "\\|" }; 058 final HashSet<String> edgeSet = new HashSet<String>(); 059 for (final String string : EdgePunctArr) { 060 edgeSet.add(string); 061 } 062 EdgePunct = "[" + StringUtils.join(EdgePunctArr, "") + "]"; 063 final List<String> puncArr = new ArrayList<String>(); 064 for (final String punc : punctuation.PunctCharsList) { 065 if (edgeSet.contains(punc)) 066 continue; 067 puncArr.add(punc); 068 } 069 070 NotEdgePunct = "(?:[a-zA-Z0-9]|" + "[" + StringUtils.join(puncArr, "") + "\\-]" + ")"; 071 // NotEdgePunct = "(?:[a-zA-Z0-9])"; 072 StartPunc = "\\s|^|[.,]|" + "[a-zA-Z0-9]"; 073 EndPunc = "\\s|$|[.,]|" + "[a-zA-Z0-9]"; 074 } 075 076 /** 077 * @return the edge punctuation of the default 078 * {@link EdgePunctuationPatternProvider} 079 */ 080 public static String edgePuncPattern() { 081 return new EdgePunctuationPatternProvider(new PunctuationPatternProvider()) { 082 @Override 083 public String correctEdges(String s) { 084 return null; 085 } 086 087 @Override 088 public String patternString() { 089 return null; 090 } 091 092 }.EdgePunct; 093 } 094 095 /** 096 * @param s 097 * @return given a string, match the edge punctuation and deal with it 098 * somehow 099 */ 100 public abstract String correctEdges(String s); 101 102 /** 103 * Left edge punctuations. Construct the edge pattern with 104 * (StartPunc)(EdgePunct+)(NotEdgePunct) and replaces with: $1$2 $3. 105 * 106 * This solves this problem: "Here is (bracketed string)" is replaced with: 107 * "Here is ( bracketed string)" 108 * 109 * by this class 110 * 111 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 112 * 113 */ 114 public static class Left extends EdgePunctuationPatternProvider { 115 protected String EdgePunctLeft; 116 117 /** 118 * @see EdgePunctuationPatternProvider#EdgePunctuationPatternProvider(PunctuationPatternProvider) 119 * @param punctuation 120 */ 121 public Left(PunctuationPatternProvider punctuation) { 122 super(punctuation); 123 EdgePunctLeft = String.format("(%s)(%s+)(%s)", StartPunc, EdgePunct, NotEdgePunct); 124 } 125 126 @Override 127 public String patternString() { 128 return EdgePunctLeft; 129 } 130 131 @Override 132 public String correctEdges(String s) { 133 // Matcher matcher = pattern().matcher(s); 134 // while(matcher.find()){ 135 // System.out.println("Found RIGHT match: '" + 136 // s.substring(matcher.start(),matcher.end()) + "'"); 137 // System.out.println("... ngroups: " + matcher.groupCount()); 138 // for(int i = 0; i < matcher.groupCount(); i++){ 139 // System.out.println("... ... '" + matcher.group(i) + "'"); 140 // } 141 // } 142 return pattern().matcher(s).replaceAll("$1$2 $3"); 143 } 144 145 } 146 147 /** 148 * Left edge punctuations. Construct the edge pattern with 149 * (StartPunc)(EdgePunct+)(NotEdgePunct) and replaces with: $1 $2$3. 150 * 151 * This solves this problem: "Here is (bracketed string)" is replaced with: 152 * "Here is (bracketed string )" 153 * 154 * by this class 155 * 156 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 157 * 158 */ 159 public static class Right extends EdgePunctuationPatternProvider { 160 protected String EdgePunctRight; 161 162 /** 163 * @param punctuation 164 * currently unused 165 */ 166 public Right(PunctuationPatternProvider punctuation) { 167 super(punctuation); 168 EdgePunctRight = String.format("(%s)(%s+)(%s)", NotEdgePunct, EdgePunct, EndPunc); 169 // System.out.println("Right match pattern: " + EdgePunctRight); 170 } 171 172 @Override 173 public String patternString() { 174 return EdgePunctRight; 175 } 176 177 @Override 178 public String correctEdges(String s) { 179 // 180 // Matcher matcher = pattern().matcher(s); 181 // while(matcher.find()){ 182 // System.out.println("Found RIGHT match: '" + 183 // s.substring(matcher.start(),matcher.end()) + "'"); 184 // System.out.println("... ngroups: " + matcher.groupCount()); 185 // for(int i = 0; i < matcher.groupCount(); i++){ 186 // System.out.println("... ... '" + matcher.group(i) + "'"); 187 // } 188 // } 189 final String ret = pattern().matcher(s).replaceAll("$1 $2$3"); 190 return ret; 191 } 192 193 } 194 195 static PunctuationPatternProvider punctuation = new PunctuationPatternProvider(); 196 static EdgePunctuationPatternProvider edgeleft = new EdgePunctuationPatternProvider.Left(punctuation); 197 static EdgePunctuationPatternProvider edgeright = new EdgePunctuationPatternProvider.Right(punctuation); 198 199 /** 200 * pads start/end brackets with a space so they can be correctly matched 201 * while not screwing up the rest of the text 202 * 203 * @param text 204 * @return the corrected text 205 */ 206 public static String fixedges(String text) { 207 String s = text; 208 s = edgeleft.correctEdges(s); 209 s = edgeright.correctEdges(s); 210 ; 211 return s; 212 } 213 214}