/** * Copyright 2013-2015 Pierre Merienne * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.pmerienne.trident.ml.nlp; import java.io.Serializable; import java.util.HashMap; import java.util.List; import java.util.Map; import com.github.pmerienne.trident.ml.util.MathUtil; public class TFIDF implements TextFeaturesExtractor, Serializable { private static final long serialVersionUID = -6758089189650946158L; private Integer corpusSize; private Map<String, Double> termsInverseDocumentFrequencies; public TFIDF() { } public TFIDF(List<List<String>> documents, int featureSize) { this.init(documents, featureSize); } @Override public double[] extractFeatures(List<String> documentTerms) { double[] features = new double[this.termsInverseDocumentFrequencies.size()]; int i = 0; for (String term : this.termsInverseDocumentFrequencies.keySet()) { features[i] = this.tfIdf(term, documentTerms); i++; } return MathUtil.normalize(features); } public void init(List<List<String>> documents, int featureSize) { // Init vocabulary Vocabulary vocabulary = new Vocabulary(); for (List<String> document : documents) { vocabulary.addAll(document); } vocabulary.limitWords(featureSize); // Calculates idfs this.corpusSize = documents.size(); this.termsInverseDocumentFrequencies = new HashMap<String, Double>(vocabulary.wordCount()); for (String term : vocabulary) { double idf = this.idf(term, documents); this.termsInverseDocumentFrequencies.put(term, idf); } } protected double tf(String term, List<String> documentTerms) { double tf = 0.0; for (String documentTerm : documentTerms) { if (documentTerm.equals(term)) { tf++; } } return tf; } protected double idf(String term, List<List<String>> documents) { // number of documents where term appears double d = 0.0; for (List<String> document : documents) { if (document.contains(term)) { d++; } } return Math.log(this.corpusSize / (1 + d)); } protected double tfIdf(String term, List<String> documentTerms) { double idf = this.termsInverseDocumentFrequencies.containsKey(term) ? this.termsInverseDocumentFrequencies.get(term) : Math.log(this.corpusSize); double tf = this.tf(term, documentTerms); return tf * idf; } public Integer getCorpusSize() { return corpusSize; } public void setCorpusSize(Integer corpusSize) { this.corpusSize = corpusSize; } public Map<String, Double> getTermsInverseDocumentFrequencies() { return termsInverseDocumentFrequencies; } public void setTermsInverseDocumentFrequencies(Map<String, Double> termsInverseDocumentFrequencies) { this.termsInverseDocumentFrequencies = termsInverseDocumentFrequencies; } }