/** * Copyright 2015, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.util; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.PrintStream; import java.util.List; import edu.emory.clir.clearnlp.collection.map.ObjectIntHashMap; import edu.emory.clir.clearnlp.collection.ngram.Unigram; import edu.emory.clir.clearnlp.collection.pair.ObjectIntPair; import edu.emory.clir.clearnlp.util.constant.PatternConst; /** * @author Jinho D. 
Choi ({@code jinho.choi@emory.edu})
 */
// Utility for counting, over a list of files, how many files each bag-of-words
// item occurs in, and for writing those counts to an output file.
public class TFIDF
{
	// Both tables are created empty by the constructor; nothing else in this
	// class reads or writes them.
	Unigram<String> term_frequencies;
	Unigram<String> document_frequencies;

	/** Creates an instance with empty term-frequency and document-frequency tables. */
	public TFIDF()
	{
		term_frequencies = new Unigram<>();
		document_frequencies = new Unigram<>();
	}

	/**
	 * Counts, for every item produced by {@code DSUtils.getBagOfWords} on each file,
	 * the number of times it was added across all files.
	 * NOTE(review): this is a document frequency only if the bag of words yields
	 * each distinct item once per file and {@code map.add} increments by one - confirm.
	 *
	 * @param filenames paths of the files to read.
	 * @return a map from item to its accumulated count.
	 * @throws FileNotFoundException if any of the files cannot be opened.
	 * @throws java.io.UncheckedIOException if closing an input file fails.
	 */
	static public ObjectIntHashMap<String> getDocumentFrequencyCounts(List<String> filenames) throws FileNotFoundException
	{
		ObjectIntHashMap<String> map = new ObjectIntHashMap<>();

		for (String filename : filenames)
		{
			// try-with-resources: the original leaked one open file handle per input file.
			try (FileInputStream in = new FileInputStream(filename))
			{
				for (String s : DSUtils.getBagOfWords(in, PatternConst.WHITESPACES))
					map.add(s);
			}
			catch (FileNotFoundException e)
			{
				// Rethrow unchanged so callers still see the declared exception type.
				throw e;
			}
			catch (java.io.IOException e)
			{
				// Only the implicit close() can raise this; the signature cannot
				// be widened, so surface it unchecked with the cause preserved.
				throw new java.io.UncheckedIOException("Failed to close " + filename, e);
			}
		}

		return map;
	}

	/**
	 * Collects the ".txt" files found via {@code FileUtils.getFileList(args[0], ".txt", false)},
	 * counts their bag-of-words items, prints the number of files to standard output,
	 * and writes one line per item to the file named by {@code args[1]}:
	 * item, count, and count divided by the number of files, separated by tabs.
	 * Lines are written in the order produced by {@code DSUtils.sortReverseOrder}.
	 *
	 * @param args {@code args[0]}: input path; {@code args[1]}: output file.
	 * @throws FileNotFoundException if an input file cannot be opened.
	 * @throws IllegalArgumentException if fewer than two arguments are given.
	 */
	static public void main(String[] args) throws FileNotFoundException
	{
		if (args == null || args.length < 2)
			throw new IllegalArgumentException("Expected 2 arguments: <input path> <output file>");

		List<String> filenames = FileUtils.getFileList(args[0], ".txt", false);
		ObjectIntHashMap<String> map = getDocumentFrequencyCounts(filenames);
		List<ObjectIntPair<String>> list = map.toList();
		DSUtils.sortReverseOrder(list);

		// The stream is buffered: without close() the tail of the output (or all
		// of it, for small outputs) may never be flushed to disk.
		try (PrintStream fout = IOUtils.createBufferedPrintStream(args[1]))
		{
			int size = filenames.size();
			System.out.println(size);

			for (ObjectIntPair<String> p : list)
				fout.printf("%s\t%d\t%6.4f\n", p.o, p.i, MathUtils.divide(p.i,size));
		}
	}
}