1 /** 2 * Copyright (c) 2011, The University of Southampton and the individual contributors. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without modification, 6 * are permitted provided that the following conditions are met: 7 * 8 * * Redistributions of source code must retain the above copyright notice, 9 * this list of conditions and the following disclaimer. 10 * 11 * * Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * * Neither the name of the University of Southampton nor the names of its 16 * contributors may be used to endorse or promote products derived from this 17 * software without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 23 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 26 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 package org.openimaj.pgm.util; 31 32 import org.openimaj.feature.SparseIntFV; 33 34 /** 35 * A document is a bag of words 36 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 37 * 38 */ 39 public class Document extends SparseIntFV{ 40 41 /** 42 * 43 */ 44 private static final long serialVersionUID = 2073772149798865435L; 45 private int length; 46 47 /** 48 * @param corpus documents from the same corpus share the same vocabulary size 49 */ 50 public Document(Corpus corpus) { 51 super(corpus.vocabularySize()); 52 } 53 54 /** 55 * @param vocabularySize the number of words in this vocabulary 56 */ 57 public Document(int vocabularySize) { 58 super(vocabularySize); 59 } 60 /** 61 * @return the number of unique words in this document 62 */ 63 public int countUniqueWords(){ 64 return this.getVector().used(); 65 } 66 67 68 @Override 69 public int length(){ 70 return this.length; 71 } 72 73 /** 74 * sets a word in the document's count. 75 * @param word 76 * @param count 77 */ 78 public void setWordCount(int word, int count){ 79 if(this.getVector().isUsed(word)){ 80 this.length -= this.getVector().get(word); 81 } 82 this.length += this.getVector().set(word, count); 83 } 84 85 86 }