/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */ 030package org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.matlabio; 031 032import java.io.BufferedReader; 033import java.io.File; 034import java.io.FileInputStream; 035import java.io.IOException; 036import java.io.InputStreamReader; 037import java.io.StringReader; 038import java.util.ArrayList; 039import java.util.LinkedHashMap; 040import java.util.Map.Entry; 041 042import org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.TimeIndex; 043import org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.WordIndex; 044import org.openimaj.hadoop.tools.twitter.utils.WordDFIDF; 045import org.openimaj.util.pair.IndependentPair; 046 047import com.Ostermiller.util.CSVParser; 048import com.jmatio.io.MatFileWriter; 049import com.jmatio.types.MLArray; 050import com.jmatio.types.MLCell; 051import com.jmatio.types.MLChar; 052import com.jmatio.types.MLDouble; 053import com.jmatio.types.MLSparse; 054 055public class SparseCSVToMatlab { 056 static class WordTimeDFIDF { 057 int word; 058 int time; 059 WordDFIDF idf; 060 } 061 062 public static void main(String[] args) throws IOException { 063 064 String sparseCSVRoot = "/Users/ss/Development/data/TrendMiner/sheffield/2010/09/tweets.2010-09.24hours.top100k.sparsecsv"; 065 String outfileName = "mat_file.mat"; 066 if (args.length > 0) { 067 sparseCSVRoot = args[0]; 068 if (args.length > 1) { 069 outfileName = args[1]; 070 } 071 } 072 073 final LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = WordIndex.readWordCountLines(sparseCSVRoot); 074 final LinkedHashMap<Long, IndependentPair<Long, Long>> timeIndex = TimeIndex.readTimeCountLines(sparseCSVRoot); 075 System.out.println("Preparing matlab files"); 076 077 final MLCell wordCell = new MLCell("words", new int[] { wordIndex.size(), 2 }); 078 final MLCell timeCell = new MLCell("times", new int[] { timeIndex.size(), 2 }); 079 080 System.out.println("... 
reading times"); 081 for (final Entry<Long, IndependentPair<Long, Long>> ent : timeIndex.entrySet()) { 082 final long time = ent.getKey(); 083 final int timeCellIndex = (int) (long) ent.getValue().secondObject(); 084 final long count = ent.getValue().firstObject(); 085 timeCell.set(new MLDouble(null, new double[][] { new double[] { time } }), timeCellIndex, 0); 086 timeCell.set(new MLDouble(null, new double[][] { new double[] { count } }), timeCellIndex, 1); 087 } 088 089 System.out.println("... reading words"); 090 for (final Entry<String, IndependentPair<Long, Long>> ent : wordIndex.entrySet()) { 091 final String word = ent.getKey(); 092 final int wordCellIndex = (int) (long) ent.getValue().secondObject(); 093 final long count = ent.getValue().firstObject(); 094 wordCell.set(new MLChar(null, word), wordCellIndex, 0); 095 wordCell.set(new MLDouble(null, new double[][] { new double[] { count } }), wordCellIndex, 1); 096 } 097 098 System.out.println("... preapring values array"); 099 final File valuesIn = new File(sparseCSVRoot, "values/part-r-00000"); 100 BufferedReader reader = null; 101 102 try { 103 reader = new BufferedReader(new InputStreamReader(new FileInputStream(valuesIn), "UTF-8")); 104 final int nValues = wordIndex.size() * timeIndex.size(); 105 final MLSparse matarr = new MLSparse("values", new int[] { wordIndex.size(), timeIndex.size() }, 0, nValues); 106 System.out.println("... 
reading values"); 107 String wholeLine = null; 108 while ((wholeLine = reader.readLine()) != null) { 109 final StringReader strReader = new StringReader(wholeLine); 110 final CSVParser parser = new CSVParser(strReader); 111 final String[] line = parser.getLine(); 112 if (line == null) { 113 continue; 114 } 115 final WordTimeDFIDF wtd = new WordTimeDFIDF(); 116 wtd.word = Integer.parseInt(line[0]); 117 wtd.time = Integer.parseInt(line[1]); 118 wtd.idf = new WordDFIDF(); 119 wtd.idf.timeperiod = timeCell.getIndex(wtd.time, 0); 120 wtd.idf.wf = Integer.parseInt(line[2]); 121 wtd.idf.tf = Integer.parseInt(line[3]); 122 wtd.idf.Twf = Integer.parseInt(line[4]); 123 wtd.idf.Ttf = Integer.parseInt(line[5]); 124 125 matarr.set(wtd.idf.dfidf(), wtd.word, wtd.time); 126 } 127 System.out.println("writing!"); 128 final ArrayList<MLArray> list = new ArrayList<MLArray>(); 129 list.add(wordCell); 130 list.add(timeCell); 131 list.add(matarr); 132 new MatFileWriter(sparseCSVRoot + File.separator + outfileName, list); 133 } finally { 134 reader.close(); 135 } 136 } 137}