001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.patterns;
031
032import java.util.ArrayList;
033import java.util.Arrays;
034import java.util.List;
035
036/**
037 *
038 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
039 *
040 */
041// @formatter:off
042public class PunctuationPatternProvider extends PatternProvider{
043
044        String[] PunctCharsList = new String[]{
045                "'","\\|","\\/","\\-",
046                "\u2026", // Ellipses
047                "\u201c", // open quote
048                "\u201d", // close quote
049                "\"",".","?","!",",",":",";","&","*",
050                "\u2018", // left quote
051                "\u2019", // right quote
052                "\u02BC", // another kind of apostrophe
053                "\\<",
054                "\\>",
055                "\u00AB",
056                "\u00BB",
057                "{",
058                "}",
059                "\\(",
060                "\\)",
061                "\\[",
062                "\\]",
063                "\\\\", "\\|","~","="
064        };
065//      private final String Punct;
066        private String charPuncs;
067
068        /**
069         *
070         */
071        public PunctuationPatternProvider() {
072                final String [] allpuncs = new String[PunctCharsList.length];
073                this.charPuncs = "[";
074                int i = 0;
075                for (final String punc : PunctCharsList) {
076                        allpuncs[i++] = String.format("[%s]+",punc);
077                        charPuncs += punc;
078                }
079                charPuncs+="]";
080//              this.Punct = String.format("%s", RegexUtil.regex_or_match(allpuncs));
081        }
082
083        @Override
084        public String patternString() {
085                return charPuncs + "+";
086        }
087
088        /**
089         * @return the pattern for each punctuation character
090         */
091        public String charPattern(){
092                return this.charPuncs;
093        }
094
095        /**
096         * @param toIgnore
097         * @return not some punctuation minus some characters
098         */
099        public List<String> notMinus(String ... toIgnore){
100                final List<String> allnotpuncs = new ArrayList<String>();
101                final List<String> ignoreArr = Arrays.asList(toIgnore);
102                for (final String punc : PunctCharsList) {
103                        if(ignoreArr.contains(punc)) continue;
104                        allnotpuncs.add(String.format("^%s",punc));
105                }
106                return allnotpuncs;
107        }
108
109
110}