001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.twitter.token.mode.pointwisemi.count;
031
032import java.io.DataInput;
033import java.io.DataOutput;
034import java.io.IOException;
035import java.io.PrintWriter;
036import java.util.Scanner;
037
038/**
039 * A Pair count with a unary count for each item of the pair.
040 * 
041 * The values here are the 3 counting functions:
042 * c(x,y) = Number of times the pair x and y were seen together
043 * c(x) = Number of times x was seen with ANY other token
044 * c(y) = Number of times y was seen with ANY other token
045 *
046 *
 * <p>
 * Reference: Benjamin Van Durme and Ashwin Lall. "Streaming Pointwise Mutual
 * Information". In NIPS, 2009, pages 1892-1900.
056 * 
057 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
058 *
059 *
060 */
061public class TokenPairUnaryCount extends TokenPairCount{
062        /**
063         * count of token 1 pairs
064         */
065        public long tok1count;
066        /**
067         * count of token 2 pairs
068         */
069        public long tok2count;
070
071        /**
072         * Convenience
073         */
074        public TokenPairUnaryCount() {
075        }
076        /**
077         * @param tok1 the first token (x)
078         * @param tok2 the second token (y)
079         * @param paircount the count of the tokens together (c(x,y))
080         * @param tok1count the count of the first token with any other token (c(x))
081         * @param tok2count the count of the second token with any other token (c(y))
082         */
083        public TokenPairUnaryCount(String tok1, String tok2, long paircount, long tok1count, long tok2count){
084                super(tok1,tok2);
085                this.paircount = paircount;
086                this.tok1count = tok1count;
087                this.tok2count = tok2count;
088        }
089        
090        /**
091         * same as {@link TokenPairUnaryCount#TokenPairUnaryCount(String, String, long, long, long)} using the values from
092         * the {@link TokenPairCount} instance
093         * @param tpc
094         * @param tok1count
095         * @param tok2count
096         */
097        public TokenPairUnaryCount(TokenPairCount tpc, long tok1count,long tok2count) {
098                this(tpc.firstObject(),tpc.secondObject(),tpc.paircount,tok1count,tok2count);
099        }
100        
101        @Override
102        public void writeBinary(DataOutput out) throws IOException {
103                super.writeBinary(out);
104                out.writeLong(tok1count);
105                out.writeLong(tok2count);
106        }
107        
108        @Override
109        public void readBinary(DataInput in) throws IOException {
110                super.readBinary(in);
111                this.tok1count = in.readLong();
112                this.tok2count = in.readLong();
113        }
114        
115        @Override
116        public void writeASCII(PrintWriter out) throws IOException {
117                super.writeASCII(out);
118                out.println(this.tok1count);
119                out.println(this.tok2count);
120        }
121        @Override
122        public void readASCII(Scanner in) throws IOException {
123                super.readASCII(in);
124                this.tok1count = Long.parseLong(in.nextLine());
125                this.tok2count = Long.parseLong(in.nextLine());
126        }
127        
128        /**
129         * Calculate the Pointwise mutual information score such that:
130         * PMI(x,y) = log( p(x,y) / ( p(x) p(y) ) )
131         * where we can estimate the probabilities as:
132         * p(x,y) = c(x,y) / n
133         * p(x) = c(x) / n
134         * p(y) = c(y) / n
135         * 
136         * where n is the total number of pairs observed
137         * 
138         * @param n the total number of pairs observed
139         * @return the PMI estimate 
140         */
141        public double pmi(double n){
142                return Math.log((this.paircount / n) / ( ( this.tok1count / n ) * ( this.tok2count / n ) )) ;
143        }
144}