001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.twitter.token.mode.pointwisemi.count;
031
032import java.io.BufferedInputStream;
033import java.io.BufferedOutputStream;
034import java.io.ByteArrayInputStream;
035import java.io.ByteArrayOutputStream;
036import java.io.DataInput;
037import java.io.DataInputStream;
038import java.io.DataOutput;
039import java.io.DataOutputStream;
040import java.io.IOException;
041import java.io.PrintWriter;
042import java.io.StringWriter;
043import java.util.Scanner;
044import java.util.regex.Matcher;
045import java.util.regex.Pattern;
046
047import org.openimaj.io.IOUtils;
048import org.openimaj.io.ReadWriteable;
049import org.openimaj.util.pair.IndependentPair;
050import org.openimaj.util.pair.Pair;
051
/**
 * A pair of tokens (strings) together with a single count:
 * <ul>
 * <li>number of times the pair appears together in a document</li>
 * </ul>
 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
 *
 */
060public class TokenPairCount extends Pair<String> implements ReadWriteable{
061        
062        private static final String TIMESPLIT = ".AT.";
063        private static Pattern timeSplitPattern = Pattern.compile(TIMESPLIT);
064        private static Pattern timePartPattern = Pattern.compile("T-?\\d+");
065        private static Pattern timeIDPattern = Pattern.compile(".*T(.*?)" + Pattern.quote(TIMESPLIT) + "(.*)s",Pattern.DOTALL);
066        
067        /**
068         * Number of times this pair appears together
069         */
070        public long paircount;
071        public boolean isSingle;
072        
073        /**
074         * 
075         */
076        public TokenPairCount() {
077                super(null,null);
078                this.isSingle = false;
079        }
080        
081        /**
082         * @param tok1
083         * @param tok2
084         */
085        public TokenPairCount(String tok1, String tok2) {
086                super(tok1, tok2);
087                isSingle = tok2 == null;
088        }
089
090        public TokenPairCount(String tok1) {
091                this(tok1,null);
092        }
093
094        @Override
095        public void readBinary(DataInput in) throws IOException {
096                this.isSingle = in.readBoolean();
097                this.setFirstObject(in.readUTF());
098                if(!isSingle)
099                        this.setSecondObject(in.readUTF());
100                this.paircount = in.readLong();
101        }
102
103        @Override
104        public byte[] binaryHeader() {
105                return "B".getBytes();
106        }
107
108        @Override
109        public void writeBinary(DataOutput out) throws IOException {
110                out.writeBoolean(this.isSingle);
111                out.writeUTF(this.firstObject());
112                if(!this.isSingle)
113                        out.writeUTF(this.secondObject());
114                out.writeLong(paircount);
115        }
116
117        public void add(TokenPairCount that) {
118                this.paircount +=that.paircount;
119                
120        }
121        
122        /**
123         * @return identifier string without a count
124         */
125        public String identifier(){
126                long count = this.paircount;
127                this.paircount = 0;
128                String out = toString();
129                this.paircount = count;
130                return out;
131        }
132        
133        @Override
134        public String toString() {
135                StringWriter writer = new StringWriter();
136                try {
137                        IOUtils.writeASCII(writer, this);
138                } catch (IOException e) {
139                        return "ERRORSTRING";
140                }
141                return writer.toString();
142        }
143
144        @Override
145        public void readASCII(Scanner in) throws IOException {
146                this.isSingle = Boolean.parseBoolean(in.nextLine());
147                this.setFirstObject(in.nextLine());
148                if(!this.isSingle){
149                        this.setSecondObject(in.nextLine());
150                }
151                if(in.hasNextLine())
152                        this.paircount = Long.parseLong(in.nextLine());
153        }
154
155        @Override
156        public String asciiHeader() {
157                return "A";
158        }
159
160        @Override
161        public void writeASCII(PrintWriter out) throws IOException {
162                out.println(this.isSingle);
163                out.println(this.firstObject());
164                if(!this.isSingle){
165                        out.println(this.secondObject());
166                }
167                out.println(paircount);
168        }
169
170        /**
171         * Given a string, extract the time and TokenPairCount assuming the format:
172         * time + TokenPairCount#TIMESPLIT + {@link TokenPairCount#identifier()}
173         * @param string
174         * @return a time and TokenPairCount (with a zero count of course)
175         * @throws IOException 
176         */
177        public static IndependentPair<Long, TokenPairCount> parseTimeTokenID(String string) throws IOException {
178                Matcher matcher = timeIDPattern.matcher(string);
179                if(!matcher.matches()) 
180                        throw new IOException("Ivalid time ID");
181                long time = Long.parseLong(matcher.group(1));
182                TokenPairCount tpc = IOUtils.fromString(matcher.group(2), TokenPairCount.class);
183                return IndependentPair.pair(time, tpc);
184        }
185
186        public String identifier(long time) {
187                return "T" + time + TIMESPLIT + identifier();
188        }
189        
190        /**
191         * Generate a byte array identifier with some time stamp included. 
192         * This function writes time then calls {@link #writeBinary(DataOutput)}
193         * @param time
194         * @return a byte array encoded as: time,{@link TokenPairCount}
195         * @throws IOException
196         */
197        public byte[] identifierBinary(long time) throws IOException{
198                ByteArrayOutputStream baos = new ByteArrayOutputStream();
199                DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(baos));
200                dos.writeLong(time);
201                writeBinary(dos);
202                dos.flush();
203                dos.close();
204                return baos.toByteArray();
205        }
206
207        public static long timeFromBinaryIdentity(byte[] bytes) throws IOException {
208                return timeFromBinaryIdentity(bytes,0,bytes.length);
209        }
210        
211        public static long timeFromBinaryIdentity(byte[] bytes,int start, int length) throws IOException {
212                DataInputStream dis = null ;
213                try{
214                        dis = new DataInputStream(new BufferedInputStream(new ByteArrayInputStream(bytes,start,length)));
215                        return dis.readLong();
216                }
217                finally{
218                        dis.close();
219                }
220        }
221        
222}