001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import java.util.ArrayList;
033
/**
 * Produces candidate rationalisations of an out-of-vocabulary word. Spell
 * checking should have been tried without success before using this tool.
 * Currently the only rationalisation performed is the removal of excessive
 * character repetition: every run of two or more identical characters is
 * reduced to either one or two occurrences, and all resulting spellings are
 * returned.
 * 
 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 */
public class Rationaliser {

        /**
         * Sections built by the most recent call to
         * {@link #getCombinations(String)}; {@code null} until the first call.
         * The computation itself works on local state; this field is only
         * assigned so that the existing package-visible member stays available.
         */
        ArrayList<Section> sections;

        /**
         * Default constructor.
         */
        public Rationaliser() {
        }

        /**
         * Splits the word into literal stretches and repeated-character runs, then
         * expands every run to one or two occurrences of its character.
         * <p>
         * A word with {@code k} repeated runs yields {@code 2^k} strings. The
         * results are ordered with the earliest run varying slowest and the
         * single-occurrence spelling of each run listed before the doubled one.
         * A word without repeated characters yields only itself, and the empty
         * word yields a list containing just the empty string.
         * 
         * @param word
         *            the word to rationalise
         * @return list of the rationalised possibilities
         * @throws NullPointerException
         *             if {@code word} is {@code null}
         */
        public ArrayList<String> getCombinations(String word) {
                final ArrayList<Section> built = buildSections(word);
                sections = built;

                // Start from the single empty prefix and extend it section by
                // section. With no sections (empty word) the empty prefix is the
                // only result, rather than an out-of-bounds lookup.
                ArrayList<String> partials = new ArrayList<String>();
                partials.add("");
                for (final Section section : built) {
                        final ArrayList<String> options = section.getCombinations();
                        final ArrayList<String> extended = new ArrayList<String>(partials.size() * options.size());
                        for (final String prefix : partials) {
                                for (final String option : options) {
                                        extended.add(prefix + option);
                                }
                        }
                        partials = extended;
                }
                return partials;
        }

        /**
         * Scans the word run by run. Characters that occur singly are gathered
         * into literal sections; each run of two or more identical characters
         * closes the pending literal section and becomes a repeat section.
         */
        private static ArrayList<Section> buildSections(String word) {
                final ArrayList<Section> built = new ArrayList<Section>();
                final StringBuilder literal = new StringBuilder();
                final int length = word.length();

                int start = 0;
                while (start < length) {
                        final char current = word.charAt(start);
                        // find the end (exclusive) of the run of `current`
                        int end = start + 1;
                        while (end < length && word.charAt(end) == current) {
                                end++;
                        }

                        if (end - start == 1) {
                                literal.append(current);
                        } else {
                                // a repeat interrupts the literal text gathered so far
                                if (literal.length() > 0) {
                                        built.add(new ValidSection(literal.toString()));
                                        literal.setLength(0);
                                }
                                built.add(new RepeatSection(current));
                        }
                        start = end;
                }
                if (literal.length() > 0) {
                        built.add(new ValidSection(literal.toString()));
                }
                return built;
        }

        /**
         * A contiguous piece of the input word together with the spellings it may
         * take in a rationalised result.
         */
        private abstract static class Section {
                final String value;

                Section(String value) {
                        this.value = value;
                }

                /**
                 * @return the alternative spellings of this section, in output order
                 */
                abstract ArrayList<String> getCombinations();
        }

        /**
         * Literal text containing no repeated characters; it has exactly one
         * spelling, itself.
         */
        private static class ValidSection extends Section {
                ValidSection(String value) {
                        super(value);
                }

                @Override
                ArrayList<String> getCombinations() {
                        final ArrayList<String> res = new ArrayList<String>(1);
                        res.add(value);
                        return res;
                }
        }

        /**
         * A run of one repeated character; it may be spelt with a single or a
         * double occurrence of that character.
         */
        private static class RepeatSection extends Section {
                RepeatSection(char repeated) {
                        super(String.valueOf(repeated));
                }

                @Override
                ArrayList<String> getCombinations() {
                        final ArrayList<String> res = new ArrayList<String>(2);
                        res.add(value);
                        res.add(value + value);
                        return res;
                }
        }

        /**
         * Prints the rationalisations of a sample word, one per line.
         * 
         * @param args
         *            ignored
         */
        public static void main(String[] args) {
                final Rationaliser rc = new Rationaliser();
                for (final String s : rc.getCombinations("BBBlaaaaddddiblah")) {
                        System.out.println(s);
                }
        }

}