001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.pgm.util;
031
032import gnu.trove.map.hash.TIntIntHashMap;
033import gnu.trove.procedure.TIntIntProcedure;
034
035import java.io.IOException;
036import java.io.InputStream;
037import java.util.ArrayList;
038import java.util.HashMap;
039import java.util.List;
040import java.util.Map;
041
042import org.openimaj.io.FileUtils;
043
044/**
045 * A corpus from a document whose lines are documents and whose words are
046 * seperated by a space
047 * 
048 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
049 * 
050 */
051public class SimpleCorpusReader implements CorpusReader {
052
053        private InputStream is;
054
055        /**
056         * @param resourceAsStream
057         */
058        public SimpleCorpusReader(InputStream resourceAsStream) {
059                is = resourceAsStream;
060        }
061
062        @Override
063        public Corpus readCorpus() throws IOException {
064                final String[] lines = FileUtils.readlines(is);
065                final Map<String, Integer> vocabulary = new HashMap<String, Integer>();
066                final List<TIntIntHashMap> docs = new ArrayList<TIntIntHashMap>();
067                for (final String docLine : lines) {
068                        final String[] words = docLine.split(" ");
069                        final TIntIntHashMap d = new TIntIntHashMap();
070                        for (final String word : words) {
071                                Integer value = 0;
072                                if ((value = vocabulary.get(word)) == null) {
073                                        vocabulary.put(word, value = vocabulary.size());
074                                }
075                                d.adjustValue(value, 1);
076                        }
077                        docs.add(d);
078                }
079                final Corpus c = new Corpus(vocabulary.size());
080                for (final TIntIntHashMap doc : docs) {
081                        final Document d = new Document(c);
082                        doc.forEachEntry(new TIntIntProcedure() {
083                                @Override
084                                public boolean execute(int word, int count) {
085                                        d.values.set(word, count);
086                                        return true;
087                                }
088                        });
089                        c.addDocument(d);
090                }
091
092                return c;
093        }
094
095}