001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.hadoop.tools.twitter.token.mode.pointwisemi.count; 031 032import java.io.DataInput; 033import java.io.DataOutput; 034import java.io.IOException; 035import java.io.PrintWriter; 036import java.util.Scanner; 037 038/** 039 * A Pair count with a unary count for each item of the pair. 040 * 041 * The values here are the 3 counting functions: 042 * c(x,y) = Number of times the pair x and y were seen together 043 * c(x) = Number of times x was seen with ANY other token 044 * c(y) = Number of times y was seen with ANY other token 045 * 046 * 047@References(references = { 048 @Reference( 049 author = {Benjamin Van Durme and Ashwin Lall}, 050 title = {Streaming Pointwise Mutual Information}, 051 booktitle = {NIPS}, 052 year = {2009}, 053 pages = {1892-1900} 054 ) 055}) 056 * 057 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 058 * 059 * 060 */ 061public class TokenPairUnaryCount extends TokenPairCount{ 062 /** 063 * count of token 1 pairs 064 */ 065 public long tok1count; 066 /** 067 * count of token 2 pairs 068 */ 069 public long tok2count; 070 071 /** 072 * Convenience 073 */ 074 public TokenPairUnaryCount() { 075 } 076 /** 077 * @param tok1 the first token (x) 078 * @param tok2 the second token (y) 079 * @param paircount the count of the tokens together (c(x,y)) 080 * @param tok1count the count of the first token with any other token (c(x)) 081 * @param tok2count the count of the second token with any other token (c(y)) 082 */ 083 public TokenPairUnaryCount(String tok1, String tok2, long paircount, long tok1count, long tok2count){ 084 super(tok1,tok2); 085 this.paircount = paircount; 086 this.tok1count = tok1count; 087 this.tok2count = tok2count; 088 } 089 090 /** 091 * same as {@link TokenPairUnaryCount#TokenPairUnaryCount(String, String, long, long, long)} using the values from 092 * the {@link TokenPairCount} instance 093 * @param tpc 094 * @param tok1count 095 * @param tok2count 096 */ 097 public TokenPairUnaryCount(TokenPairCount tpc, long tok1count,long tok2count) { 098 this(tpc.firstObject(),tpc.secondObject(),tpc.paircount,tok1count,tok2count); 099 } 100 101 @Override 102 public void writeBinary(DataOutput out) throws IOException { 103 super.writeBinary(out); 104 out.writeLong(tok1count); 105 out.writeLong(tok2count); 106 } 107 108 @Override 109 public void readBinary(DataInput in) throws IOException { 110 super.readBinary(in); 111 this.tok1count = in.readLong(); 112 this.tok2count = in.readLong(); 113 } 114 115 @Override 116 public void writeASCII(PrintWriter out) throws IOException { 117 super.writeASCII(out); 118 out.println(this.tok1count); 119 out.println(this.tok2count); 120 } 121 @Override 122 public void readASCII(Scanner in) throws IOException { 123 super.readASCII(in); 124 this.tok1count = Long.parseLong(in.nextLine()); 125 this.tok2count = Long.parseLong(in.nextLine()); 126 } 127 128 /** 129 * Calculate the Pointwise mutual information score such that: 130 * PMI(x,y) = log( p(x,y) / ( p(x) p(y) ) ) 131 * where we can estimate the probabilities as: 132 * p(x,y) = c(x,y) / n 133 * p(x) = c(x) / n 134 * p(y) = c(y) / n 135 * 136 * where n is the total number of pairs observed 137 * 138 * @param n the total number of pairs observed 139 * @return the PMI estimate 140 */ 141 public double pmi(double n){ 142 return Math.log((this.paircount / n) / ( ( this.tok1count / n ) * ( this.tok2count / n ) )) ; 143 } 144}