/*
* This file is part of Caliph & Emir.
*
* Caliph & Emir is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Caliph & Emir is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Caliph & Emir; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Copyright statement:
* --------------------
* (c) 2002-2005 by Mathias Lux (mathias@juggle.at)
* http://www.juggle.at, http://caliph-emir.sourceforge.net
*/
package at.lux.retrieval.clustering.suffixtree;
import java.util.HashMap;
import java.util.HashSet;
/**
* @author Mathias Lux, mathias@juggle.at
* Date: 03.06.2004
* Time: 16:17:52
*/
public class WordIndex {
    /** Maps each token to the set of documents it was indexed with. */
    private final HashMap<String, HashSet<StcDocument>> index;
    /** Every document that has been handed to {@link #addToIndex(String[], StcDocument)}. */
    private final HashSet<StcDocument> documents;

    /**
     * Creates an empty index containing no tokens and no documents.
     */
    public WordIndex() {
        index = new HashMap<String, HashSet<StcDocument>>();
        documents = new HashSet<StcDocument>();
    }

    /**
     * Registers a document and associates it with each of the given tokens.
     * The document is recorded even when <code>tokens</code> is empty. Documents
     * are kept in hash sets, so what counts as "the same document" is decided
     * by <code>StcDocument</code>'s equals/hashCode.
     *
     * @param tokens      the tokens occurring in the document; must not be <code>null</code>
     * @param stcDocument the document the tokens belong to
     */
    public void addToIndex(String[] tokens, StcDocument stcDocument) {
        documents.add(stcDocument);
        for (String token : tokens) {
            // Single lookup; sets stored in the index are never null, so a
            // null result means the token has not been seen before.
            HashSet<StcDocument> postings = index.get(token);
            if (postings == null) {
                postings = new HashSet<StcDocument>();
                index.put(token, postings);
            }
            postings.add(stcDocument);
        }
    }

    /**
     * Returns the number of distinct indexed documents associated with the token.
     *
     * @param token the token to look up
     * @return the size of the token's document set, or 0 if the token is not in the index
     */
    public int getDocumentCount(String token) {
        HashSet<StcDocument> postings = index.get(token);
        return postings == null ? 0 : postings.size();
    }

    /**
     * Returns the number of distinct documents added to this index so far.
     *
     * @return the size of the document set
     */
    public int getNumberOfDocuments() {
        return documents.size();
    }

    /**
     * Returns a double in [0,1] giving the fraction of indexed documents in
     * which the given token occurs.
     *
     * @param token the token to look up
     * @return {@link #getDocumentCount(String)} divided by
     *         {@link #getNumberOfDocuments()}, or 0.0 if no document has been
     *         indexed yet
     */
    public double getDocumentFrequency(String token) {
        int numDocs = getNumberOfDocuments();
        if (numDocs == 0) {
            // Without this guard the division below would be 0.0 / 0.0 = NaN,
            // which falls outside the documented [0,1] range.
            return 0.0;
        }
        return (double) getDocumentCount(token) / numDocs;
    }
}