001/** 002 * Copyright (c) 2012, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.hadoop.tools.twitter.utils; 031 032import java.io.DataInput; 033import java.io.DataOutput; 034import java.io.IOException; 035import java.io.PrintWriter; 036import java.util.Scanner; 037 038import org.openimaj.io.ReadWriteable; 039 040/** 041 * Convenience class which holds all the components required to calculate DF-IDF 042 * 043 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 044 * 045 */ 046public class WordDFIDF implements ReadWriteable, Comparable<WordDFIDF> { 047 /** 048 * Total number of tweets in all timeperiods 049 */ 050 public long Ttf; 051 /** 052 * Number of tweets containing this word in all timeperiods 053 */ 054 public long Twf; 055 /** 056 * Number of tweets in this timeperiod 057 */ 058 public long tf; 059 060 /** 061 * Number of tweets containing this word in this time period 062 */ 063 public long wf; 064 /** 065 * the measurment time period 066 */ 067 public long timeperiod; 068 069 /** 070 * Helpful for reading 071 */ 072 public WordDFIDF() { 073 Ttf = Twf = tf = wf = 0; 074 } 075 076 /** 077 * @param timeperiod 078 * the timeperiod 079 * @param wf 080 * Word count in this timeperiod 081 * @param tf 082 * Tweet count in this timeperiod 083 * @param twf 084 * Word count across all time 085 * @param ttf 086 * Tweet count across all time 087 */ 088 public WordDFIDF(long timeperiod, long wf, long tf, long twf, long ttf) { 089 this.timeperiod = timeperiod; 090 this.wf = wf; 091 this.tf = tf; 092 this.Twf = twf; 093 this.Ttf = ttf; 094 } 095 096 @Override 097 public void writeBinary(DataOutput out) throws IOException { 098 out.writeLong(timeperiod); 099 out.writeLong(wf); 100 out.writeLong(tf); 101 out.writeLong(Twf); 102 out.writeLong(Ttf); 103 } 104 105 @Override 106 public byte[] binaryHeader() { 107 return "".getBytes(); 108 } 109 110 @Override 111 public void readBinary(DataInput in) throws IOException { 112 timeperiod = in.readLong(); 113 wf = in.readLong(); 114 tf = in.readLong(); 115 Twf = in.readLong(); 116 Ttf = in.readLong(); 117 } 118 119 /** 120 * DF-IDF as defined by "Event Detection in Twitter by J. Weng et. al. 2011" 121 * 122 * @return the DF-IDF score 123 */ 124 public double dfidf() { 125 final double wf = this.wf; 126 final double tf = this.tf; 127 final double Twf = this.Twf; 128 final double Ttf = this.Ttf; 129 if (tf == 0 || Ttf == 0) 130 return 0; 131 132 return (wf / tf) * Math.log(Ttf / Twf); 133 } 134 135 @Override 136 public int compareTo(WordDFIDF other) { 137 return new Long(timeperiod).compareTo(other.timeperiod); 138 } 139 140 @Override 141 public boolean equals(Object obj) { 142 if (!(obj instanceof WordDFIDF)) 143 return false; 144 final WordDFIDF that = (WordDFIDF) obj; 145 return that.compareTo(this) == 0; 146 } 147 148 @Override 149 public int hashCode() { 150 return (int) (timeperiod ^ (timeperiod >>> 32)); 151 } 152 153 @Override 154 public String toString() { 155 final String format = "(wf=%s, tf=%s, Twf=%s, Ttf=%s, DFIDF=%.5f)"; 156 return String.format(format, wf, tf, Twf, Ttf, dfidf()); 157 } 158 159 @Override 160 public void readASCII(Scanner in) throws IOException { 161 this.timeperiod = in.nextLong(); 162 this.wf = in.nextLong(); 163 this.tf = in.nextLong(); 164 this.Twf = in.nextLong(); 165 this.Ttf = in.nextLong(); 166 } 167 168 @Override 169 public String asciiHeader() { 170 return ""; 171 } 172 173 @Override 174 public void writeASCII(PrintWriter out) throws IOException { 175 out.printf("%s %s %s %s %s", this.timeperiod, this.wf, this.tf, this.Twf, this.Ttf); 176 } 177}