View Javadoc

1   /**
2    * Copyright (c) 2011, The University of Southampton and the individual contributors.
3    * All rights reserved.
4    *
5    * Redistribution and use in source and binary forms, with or without modification,
6    * are permitted provided that the following conditions are met:
7    *
8    *   * 	Redistributions of source code must retain the above copyright notice,
9    * 	this list of conditions and the following disclaimer.
10   *
11   *   *	Redistributions in binary form must reproduce the above copyright notice,
12   * 	this list of conditions and the following disclaimer in the documentation
13   * 	and/or other materials provided with the distribution.
14   *
15   *   *	Neither the name of the University of Southampton nor the names of its
16   * 	contributors may be used to endorse or promote products derived from this
17   * 	software without specific prior written permission.
18   *
19   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21   * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22   * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23   * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26   * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28   * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29   */
30  package org.openimaj.pgm.util;
31  
32  import org.openimaj.feature.SparseIntFV;
33  
34  /**
35   * A document is a bag of words
36   * @author Sina Samangooei (ss@ecs.soton.ac.uk)
37   *
38   */
39  public class Document extends SparseIntFV{
40  
41  	/**
42  	 * 
43  	 */
44  	private static final long serialVersionUID = 2073772149798865435L;
45  	private int length;
46  	
47  	/**
48  	 * @param corpus documents from the same corpus share the same vocabulary size
49  	 */
50  	public Document(Corpus corpus) {
51  		super(corpus.vocabularySize());
52  	}
53  	
54  	/**
55  	 * @param vocabularySize the number of words in this vocabulary
56  	 */
57  	public Document(int vocabularySize) {
58  		super(vocabularySize);
59  	}
60  	/**
61  	 * @return the number of unique words in this document
62  	 */
63  	public int countUniqueWords(){
64  		return this.getVector().used();
65  	}
66  	
67  	
68  	@Override
69  	public int length(){
70  		return this.length;
71  	}
72  	
73  	/**
74  	 * sets a word in the document's count.
75  	 * @param word
76  	 * @param count
77  	 */
78  	public void setWordCount(int word, int count){
79  		if(this.getVector().isUsed(word)){
80  			this.length -= this.getVector().get(word);
81  		}
82  		this.length += this.getVector().set(word, count);
83  	}
84  
85  	
86  }