package com.knowledgebooks.nlp;
import com.knowledgebooks.public_domain.Stemmer;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import com.knowledgebooks.nlp.util.NoiseWords;
/**
* This class stores stem count data for words in a document and provides
* an API to compare the similarity between this document and another.
*
* @author Mark Watson
*
*/
/**
* Copyright Mark Watson 2008-2010. All Rights Reserved.
* License: LGPL version 3 (http://www.gnu.org/licenses/lgpl-3.0.txt)
*/
/**
 * Holds stem occurrence counts for one document and scores how similar
 * this document is to another one based on the stems they share.
 *
 * <p>The text is stemmed with {@link Stemmer#stemString(String)}; every stem
 * for which {@link NoiseWords#checkFor(String)} returns {@code true} is
 * skipped, and the remaining stems are tallied.</p>
 */
public class ComparableDocument {

    /** Occurrence count of each counted (non-skipped) stem. */
    private final Map<String, Integer> stemCountMap = new HashMap<String, Integer>();

    /** Total number of counted stems, duplicates included. */
    private int stem_count = 0;

    private ComparableDocument() {
    } // disable default constructor calls

    /**
     * Builds the stem counts from the contents of a file.
     *
     * @param document file whose text is read (platform default charset)
     * @throws FileNotFoundException if the file cannot be opened
     */
    public ComparableDocument(File document) throws FileNotFoundException {
        this(readAll(document));
    }

    /**
     * Builds the stem counts from the given text.
     *
     * @param text raw document text
     */
    public ComparableDocument(String text) {
        List<String> stems = new Stemmer().stemString(text);
        for (String stem : stems) {
            if (NoiseWords.checkFor(stem)) {
                continue;
            }
            stem_count++;
            // single lookup: a missing entry simply starts the tally at 1
            Integer previous = stemCountMap.get(stem);
            stemCountMap.put(stem, previous == null ? 1 : previous + 1);
        }
    }

    /**
     * Reads a file as one scanner token and always releases the file handle.
     * An empty file yields an empty string instead of a
     * {@code NoSuchElementException}.
     */
    private static String readAll(File document) throws FileNotFoundException {
        Scanner scanner = new Scanner(document);
        try {
            // "\\Z" (end of input) as delimiter makes the first token span the file content
            scanner.useDelimiter("\\Z");
            return scanner.hasNext() ? scanner.next() : "";
        } finally {
            scanner.close();
        }
    }

    /**
     * @return the live map from stem to occurrence count (not a copy)
     */
    public Map<String, Integer> getStemMap() {
        return stemCountMap;
    }

    /**
     * @return total number of counted stems, duplicates included
     */
    public int getStemCount() {
        return stem_count;
    }

    /**
     * Scores the similarity between this document and another one.
     *
     * <p>For every stem present in both documents the two occurrence counts
     * are added up; that sum is divided by
     * {@code 2 * sqrt(stemCount * otherStemCount)}. Comparing a document that
     * has counted stems with itself therefore yields {@code 1.0}.</p>
     *
     * @param otherDocument document to compare against; must not be null
     * @return the similarity score, or {@code 0} when either document has no
     *         counted stems (the unguarded division would produce NaN)
     */
    public float compareTo(ComparableDocument otherDocument) {
        Map<String, Integer> otherCounts = otherDocument.getStemMap();
        long sharedTotal = 0;
        for (Map.Entry<String, Integer> entry : stemCountMap.entrySet()) {
            Integer otherCount = otherCounts.get(entry.getKey());
            if (otherCount != null) {
                sharedTotal += entry.getValue() + otherCount;
            }
        }
        // widen before multiplying so two large stem counts cannot overflow int
        double normalizer = (double) stem_count * (double) otherDocument.getStemCount();
        if (normalizer <= 0.0) {
            return 0f;
        }
        // sqrt(shared^2 / normalizer) == shared / sqrt(normalizer), without squaring a long
        return (float) (sharedTotal / Math.sqrt(normalizer)) / 2f;
    }

    // throw away test program:
    public static void main(String[] args) throws FileNotFoundException {
        ComparableDocument news1 = new ComparableDocument(new File("test_data/news_1.txt"));
        ComparableDocument news2 = new ComparableDocument(new File("test_data/news_2.txt"));
        ComparableDocument econ1 = new ComparableDocument(new File("test_data/economy_1.txt"));
        ComparableDocument econ2 = new ComparableDocument(new File("test_data/economy_2.txt"));
        System.out.println("news 1 - news1: " + news1.compareTo(news1));
        System.out.println("news 1 - news2: " + news1.compareTo(news2));
        System.out.println("news 2 - news2: " + news2.compareTo(news2));
        System.out.println("news 1 - econ1: " + news1.compareTo(econ1));
        System.out.println("econ 1 - econ1: " + econ1.compareTo(econ1));
        System.out.println("news 1 - econ2: " + news1.compareTo(econ2));
        System.out.println("news 2 - econ2: " + news2.compareTo(econ2));
        System.out.println("econ 1 - econ2: " + econ1.compareTo(econ2));
        System.out.println("econ 2 - econ2: " + econ2.compareTo(econ2));
    }
}