/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */ 030package org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.matlabio; 031 032import java.io.BufferedReader; 033import java.io.File; 034import java.io.FileInputStream; 035import java.io.IOException; 036import java.io.InputStreamReader; 037import java.io.StringReader; 038import java.util.ArrayList; 039import java.util.LinkedHashMap; 040import java.util.Map.Entry; 041 042import org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.TimeIndex; 043import org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.WordIndex; 044import org.openimaj.hadoop.tools.twitter.utils.WordDFIDF; 045import org.openimaj.util.pair.IndependentPair; 046 047import com.Ostermiller.util.CSVParser; 048import com.jmatio.io.MatFileWriter; 049import com.jmatio.types.MLArray; 050import com.jmatio.types.MLCell; 051import com.jmatio.types.MLChar; 052import com.jmatio.types.MLDouble; 053import com.jmatio.types.MLSparse; 054 055public class SparseCSVToMatlab { 056 static class WordTimeDFIDF { 057 int word; 058 int time; 059 WordDFIDF idf; 060 } 061 062 public static void main(String[] args) throws IOException { 063 064 String sparseCSVRoot = "/Users/ss/Development/data/TrendMiner/sheffield/2010/09/tweets.2010-09.24hours.top100k.sparsecsv"; 065 String outfileName = "mat_file.mat"; 066 if (args.length > 0) { 067 sparseCSVRoot = args[0]; 068 if (args.length > 1) { 069 outfileName = args[1]; 070 } 071 } 072 073 final LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = WordIndex.readWordCountLines(sparseCSVRoot); 074 final LinkedHashMap<Long, IndependentPair<Long, Long>> timeIndex = TimeIndex.readTimeCountLines(sparseCSVRoot); 075 System.out.println("Preparing matlab files"); 076 077 final MLCell wordCell = new MLCell("words", new int[] { wordIndex.size(), 2 }); 078 final MLCell timeCell = new MLCell("times", new int[] { timeIndex.size(), 2 }); 079 080 System.out.println("... 
reading times"); 081 for (final Entry<Long, IndependentPair<Long, Long>> ent : timeIndex.entrySet()) { 082 final long time = ent.getKey(); 083 final int timeCellIndex = (int) (long) ent.getValue().secondObject(); 084 final long count = ent.getValue().firstObject(); 085 timeCell.set(new MLDouble(null, new double[][] { new double[] { time } }), timeCellIndex, 0); 086 timeCell.set(new MLDouble(null, new double[][] { new double[] { count } }), timeCellIndex, 1); 087 } 088 089 System.out.println("... reading words"); 090 for (final Entry<String, IndependentPair<Long, Long>> ent : wordIndex.entrySet()) { 091 final String word = ent.getKey(); 092 final int wordCellIndex = (int) (long) ent.getValue().secondObject(); 093 final long count = ent.getValue().firstObject(); 094 wordCell.set(new MLChar(null, word), wordCellIndex, 0); 095 wordCell.set(new MLDouble(null, new double[][] { new double[] { count } }), wordCellIndex, 1); 096 } 097 098 System.out.println("... preapring values array"); 099 final File valuesIn = new File(sparseCSVRoot, "values/part-r-00000"); 100 BufferedReader reader = null; 101 102 try { 103 reader = new BufferedReader(new InputStreamReader(new FileInputStream(valuesIn), "UTF-8")); 104 final int nValues = wordIndex.size() * timeIndex.size(); 105 final MLSparse matarr = new MLSparse("values", new int[] { wordIndex.size(), timeIndex.size() }, 0, nValues); 106 System.out.println("... 
reading values"); 107 String wholeLine = null; 108 while ((wholeLine = reader.readLine()) != null) { 109 final StringReader strReader = new StringReader(wholeLine); 110 final CSVParser parser = new CSVParser(strReader); 111 final String[] line = parser.getLine(); 112 if (line == null) { 113 continue; 114 } 115 final WordTimeDFIDF wtd = new WordTimeDFIDF(); 116 wtd.word = Integer.parseInt(line[0]); 117 wtd.time = Integer.parseInt(line[1]); 118 wtd.idf = new WordDFIDF(); 119 wtd.idf.timeperiod = timeCell.getIndex(wtd.time, 0); 120 wtd.idf.wf = Integer.parseInt(line[2]); 121 wtd.idf.tf = Integer.parseInt(line[3]); 122 wtd.idf.Twf = Integer.parseInt(line[4]); 123 wtd.idf.Ttf = Integer.parseInt(line[5]); 124 125 matarr.set(wtd.idf.dfidf(), wtd.word, wtd.time); 126 } 127 System.out.println("writing!"); 128 final ArrayList<MLArray> list = new ArrayList<MLArray>(); 129 list.add(wordCell); 130 list.add(timeCell); 131 list.add(matarr); 132 new MatFileWriter(sparseCSVRoot + File.separator + outfileName, list); 133 } finally { 134 reader.close(); 135 } 136 } 137}