package nlp.com.knowledgebooks.nlp;

import nlp.com.knowledgebooks.nlp.util.NoiseWords;
import nlp.public_domain.Stemmer;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;

/**
 * This class stores stem count data for words in a document and provides
 * an API to compare the similarity between this document and another.
 *
 * @author Mark Watson
 *
 * <p/>
 * Copyright 1998-2012 by Mark Watson. All rights reserved.
 * <p/>
 * This software can be used under either of the following licenses:
 * <p/>
 * 1. LGPL v3<br/>
 * 2. Apache 2
 * <p/>
 *
 */
public class ComparableDocument {

  /** Maps each counted stem to the number of times it occurred in the document. */
  private final Map<String, Integer> stemCountMap = new HashMap<String, Integer>();

  /** Total number of counted stem occurrences (the sum of all values in the map). */
  private int stemCount = 0;

  private ComparableDocument() {
  } // disable default constructor calls

  /**
   * Builds the stem counts from the contents of a text file.
   * The file is read with the platform default charset and is closed
   * before the stems are counted. An empty file yields a document built
   * from the empty string.
   *
   * @param document the file to read
   * @throws FileNotFoundException if the file cannot be opened
   */
  public ComparableDocument(File document) throws FileNotFoundException {
    this(readFile(document));
  }

  /**
   * Builds the stem counts from a text string. The text is split into stems
   * by {@code Stemmer.stemString}; every stem for which
   * {@code NoiseWords.checkFor} returns false is counted.
   *
   * @param text the document text
   */
  public ComparableDocument(String text) {
    List<String> stems = new Stemmer().stemString(text);
    for (String stem : stems) {
      if (NoiseWords.checkFor(stem)) {
        continue; // rejected by the noise word filter: not counted
      }
      stemCount++;
      Integer previous = stemCountMap.get(stem);
      stemCountMap.put(stem, previous == null ? 1 : previous + 1);
    }
  }

  /**
   * Reads a whole file into a string and always closes the underlying scanner.
   * A constructor cannot do this inline because the {@code this(...)} call
   * must be its first statement.
   */
  private static String readFile(File document) throws FileNotFoundException {
    Scanner scanner = new Scanner(document);
    try {
      // "\\Z" (end of input) as the delimiter makes the first token the whole file
      scanner.useDelimiter("\\Z");
      // an empty file has no token at all; next() would throw NoSuchElementException
      return scanner.hasNext() ? scanner.next() : "";
    } finally {
      scanner.close();
    }
  }

  /**
   * Returns the stem to occurrence count map.
   * This is the live internal map, not a copy.
   *
   * @return the map from stem to occurrence count
   */
  public Map<String, Integer> getStemMap() {
    return stemCountMap;
  }

  /**
   * Returns the total number of counted stem occurrences.
   *
   * @return the number of stems counted while building this document
   */
  public int getStemCount() {
    return stemCount;
  }

  /**
   * Scores the similarity between this document and another one.
   * For every stem present in both documents the two occurrence counts are
   * added to a running total; the score is
   * {@code sqrt(total * total / (stemCount * otherStemCount)) / 2}.
   * Comparing a non-empty document with itself yields 1.0 and documents
   * without any common stem yield 0.0.
   *
   * @param otherDocument the document to compare against
   * @return the similarity score, or 0 if either document has no counted stems
   */
  public float compareTo(ComparableDocument otherDocument) {
    int otherStemCount = otherDocument.getStemCount();
    if (stemCount == 0 || otherStemCount == 0) {
      return 0f; // nothing to compare; avoids the NaN produced by 0 / 0
    }
    Map<String, Integer> otherStemMap = otherDocument.getStemMap();
    long sharedTotal = 0;
    for (Map.Entry<String, Integer> entry : stemCountMap.entrySet()) {
      Integer otherCount = otherStemMap.get(entry.getKey());
      if (otherCount != null) {
        sharedTotal += entry.getValue() + otherCount;
      }
    }
    // multiply as doubles: the int product overflows for large documents
    double countProduct = (double) stemCount * (double) otherStemCount;
    double sharedSquared = (double) sharedTotal * (double) sharedTotal;
    return (float) Math.sqrt(sharedSquared / countProduct) / 2f;
  }

  // throw away test program:
  public static void main(String[] args) throws FileNotFoundException {
    ComparableDocument news1 = new ComparableDocument(new File("test_data/news_1.txt"));
    ComparableDocument news2 = new ComparableDocument(new File("test_data/news_2.txt"));
    ComparableDocument econ1 = new ComparableDocument(new File("test_data/economy_1.txt"));
    ComparableDocument econ2 = new ComparableDocument(new File("test_data/economy_2.txt"));
    System.out.println("news 1 - news1: " + news1.compareTo(news1));
    System.out.println("news 1 - news2: " + news1.compareTo(news2));
    System.out.println("news 2 - news2: " + news2.compareTo(news2));
    System.out.println("news 1 - econ1: " + news1.compareTo(econ1));
    System.out.println("econ 1 - econ1: " + econ1.compareTo(econ1));
    System.out.println("news 1 - econ2: " + news1.compareTo(econ2));
    System.out.println("news 2 - econ2: " + news2.compareTo(econ2));
    System.out.println("econ 1 - econ2: " + econ1.compareTo(econ2));
    System.out.println("econ 2 - econ2: " + econ2.compareTo(econ2));
  }
}