001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.hadoop.tools.twitter.token.mode.pointwisemi.count; 031 032import java.io.BufferedInputStream; 033import java.io.BufferedOutputStream; 034import java.io.ByteArrayInputStream; 035import java.io.ByteArrayOutputStream; 036import java.io.DataInput; 037import java.io.DataInputStream; 038import java.io.DataOutput; 039import java.io.DataOutputStream; 040import java.io.IOException; 041import java.io.PrintWriter; 042import java.io.StringWriter; 043import java.util.Scanner; 044import java.util.regex.Matcher; 045import java.util.regex.Pattern; 046 047import org.openimaj.io.IOUtils; 048import org.openimaj.io.ReadWriteable; 049import org.openimaj.util.pair.IndependentPair; 050import org.openimaj.util.pair.Pair; 051 052/** 053 * A pair of strings with 2 distinct counts: 054 * <ul> 055 * <li>number of times the pair appears together in a document</li> 056 * </ul> 057 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 058 * 059 */ 060public class TokenPairCount extends Pair<String> implements ReadWriteable{ 061 062 private static final String TIMESPLIT = ".AT."; 063 private static Pattern timeSplitPattern = Pattern.compile(TIMESPLIT); 064 private static Pattern timePartPattern = Pattern.compile("T-?\\d+"); 065 private static Pattern timeIDPattern = Pattern.compile(".*T(.*?)" + Pattern.quote(TIMESPLIT) + "(.*)s",Pattern.DOTALL); 066 067 /** 068 * Number of times this pair appears together 069 */ 070 public long paircount; 071 public boolean isSingle; 072 073 /** 074 * 075 */ 076 public TokenPairCount() { 077 super(null,null); 078 this.isSingle = false; 079 } 080 081 /** 082 * @param tok1 083 * @param tok2 084 */ 085 public TokenPairCount(String tok1, String tok2) { 086 super(tok1, tok2); 087 isSingle = tok2 == null; 088 } 089 090 public TokenPairCount(String tok1) { 091 this(tok1,null); 092 } 093 094 @Override 095 public void readBinary(DataInput in) throws IOException { 096 this.isSingle = in.readBoolean(); 097 this.setFirstObject(in.readUTF()); 098 if(!isSingle) 099 this.setSecondObject(in.readUTF()); 100 this.paircount = in.readLong(); 101 } 102 103 @Override 104 public byte[] binaryHeader() { 105 return "B".getBytes(); 106 } 107 108 @Override 109 public void writeBinary(DataOutput out) throws IOException { 110 out.writeBoolean(this.isSingle); 111 out.writeUTF(this.firstObject()); 112 if(!this.isSingle) 113 out.writeUTF(this.secondObject()); 114 out.writeLong(paircount); 115 } 116 117 public void add(TokenPairCount that) { 118 this.paircount +=that.paircount; 119 120 } 121 122 /** 123 * @return identifier string without a count 124 */ 125 public String identifier(){ 126 long count = this.paircount; 127 this.paircount = 0; 128 String out = toString(); 129 this.paircount = count; 130 return out; 131 } 132 133 @Override 134 public String toString() { 135 StringWriter writer = new StringWriter(); 136 try { 137 IOUtils.writeASCII(writer, this); 138 } catch (IOException e) { 139 return "ERRORSTRING"; 140 } 141 return writer.toString(); 142 } 143 144 @Override 145 public void readASCII(Scanner in) throws IOException { 146 this.isSingle = Boolean.parseBoolean(in.nextLine()); 147 this.setFirstObject(in.nextLine()); 148 if(!this.isSingle){ 149 this.setSecondObject(in.nextLine()); 150 } 151 if(in.hasNextLine()) 152 this.paircount = Long.parseLong(in.nextLine()); 153 } 154 155 @Override 156 public String asciiHeader() { 157 return "A"; 158 } 159 160 @Override 161 public void writeASCII(PrintWriter out) throws IOException { 162 out.println(this.isSingle); 163 out.println(this.firstObject()); 164 if(!this.isSingle){ 165 out.println(this.secondObject()); 166 } 167 out.println(paircount); 168 } 169 170 /** 171 * Given a string, extract the time and TokenPairCount assuming the format: 172 * time + TokenPairCount#TIMESPLIT + {@link TokenPairCount#identifier()} 173 * @param string 174 * @return a time and TokenPairCount (with a zero count of course) 175 * @throws IOException 176 */ 177 public static IndependentPair<Long, TokenPairCount> parseTimeTokenID(String string) throws IOException { 178 Matcher matcher = timeIDPattern.matcher(string); 179 if(!matcher.matches()) 180 throw new IOException("Ivalid time ID"); 181 long time = Long.parseLong(matcher.group(1)); 182 TokenPairCount tpc = IOUtils.fromString(matcher.group(2), TokenPairCount.class); 183 return IndependentPair.pair(time, tpc); 184 } 185 186 public String identifier(long time) { 187 return "T" + time + TIMESPLIT + identifier(); 188 } 189 190 /** 191 * Generate a byte array identifier with some time stamp included. 192 * This function writes time then calls {@link #writeBinary(DataOutput)} 193 * @param time 194 * @return a byte array encoded as: time,{@link TokenPairCount} 195 * @throws IOException 196 */ 197 public byte[] identifierBinary(long time) throws IOException{ 198 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 199 DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(baos)); 200 dos.writeLong(time); 201 writeBinary(dos); 202 dos.flush(); 203 dos.close(); 204 return baos.toByteArray(); 205 } 206 207 public static long timeFromBinaryIdentity(byte[] bytes) throws IOException { 208 return timeFromBinaryIdentity(bytes,0,bytes.length); 209 } 210 211 public static long timeFromBinaryIdentity(byte[] bytes,int start, int length) throws IOException { 212 DataInputStream dis = null ; 213 try{ 214 dis = new DataInputStream(new BufferedInputStream(new ByteArrayInputStream(bytes,start,length))); 215 return dis.readLong(); 216 } 217 finally{ 218 dis.close(); 219 } 220 } 221 222}