/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
/**
* Class for TFIDF analysis
* @author Themistoklis Mavridis
*/
public class TFIDF {
    /**
     * The top words produced by the most recent call to
     * {@link #compute(String[], int, String)}; also returned by that method.
     * Remains {@code null} until {@code compute} has run.
     */
    protected List<String> topWordsList;

    /**
     * Calculates the term-frequency (TF) score of a term within a document.
     * The raw relative frequency is dampened with a square root (sublinear
     * TF scaling), matching the original implementation.
     *
     * @param Doc the document to analyze
     * @param termToCheck the term to calculate TF for (matched case-insensitively)
     * @return the TF score
     */
    public double tfCalculator(String Doc, String termToCheck) {
        // Strip punctuation (non-word chars except whitespace), then split into tokens.
        String[] tokenizedTerms = Doc.replaceAll("[\\W&&[^\\s]]", "").split("\\W+");
        double count = 0; // occurrences of termToCheck in this document
        for (String token : tokenizedTerms) {
            if (token.equalsIgnoreCase(termToCheck)) {
                count++;
            }
        }
        // sqrt dampens high raw frequencies. Note: split("") never yields a
        // zero-length array, so the division is safe even for an empty Doc.
        return Math.pow(count / tokenizedTerms.length, 0.5);
    }

    /**
     * Calculates the inverse-document-frequency (IDF) score of a term.
     * Counts how many of the tokenized documents contain the term at least
     * once, then applies smoothed IDF: {@code 1 + ln(N / (1 + df))}.
     *
     * @param allwordsList the token list of every document
     * @param termToCheck the term to check for (matched case-insensitively)
     * @param NumberOfDocs the number of documents analyzed
     * @return the IDF score
     */
    public double idfCalculator(List<List<String>> allwordsList, String termToCheck, int NumberOfDocs) {
        double count = 0; // document frequency: number of docs containing the term
        for (List<String> wordList : allwordsList) {
            for (String s : wordList) {
                if (s.equalsIgnoreCase(termToCheck)) {
                    count++;
                    break; // count each document at most once
                }
            }
        }
        return 1 + Math.log(NumberOfDocs / (1 + count));
    }

    /**
     * Computes the TF-IDF score of every word over all documents, keeps one
     * score per distinct word, and returns the top {@code topWords} words
     * according to {@code DataManipulation.sortHashmap}. The resulting word
     * list is also written to {@code directory + "words.txt"} and cached in
     * {@link #topWordsList}.
     *
     * @param allDocs all the documents to analyze (entries may be null/empty)
     * @param topWords the number of top words to return (clamped to the
     *        number of distinct words actually found)
     * @param directory the directory prefix for the output file
     * @return a list with the top words, or the previous {@link #topWordsList}
     *         (possibly {@code null}) if writing the output file fails
     */
    public List<String> compute(String[] allDocs, int topWords, String directory) {
        try {
            // Pass 1: tokenize every document once so IDF can count document frequencies.
            List<List<String>> allwordsList = new ArrayList<>(allDocs.length);
            for (String doc : allDocs) {
                allwordsList.add(tokenize(doc));
            }
            // Pass 2: score each token and keep one TF-IDF score per distinct word.
            HashMap<String, Double> wordTFIDFscores = new HashMap<>();
            for (int i = 0; i < allDocs.length; i++) {
                for (String term : allwordsList.get(i)) {
                    double tfidfvalue =
                            tfCalculator(allDocs[i], term)
                                    * idfCalculator(allwordsList, term, allDocs.length);
                    Double previous = wordTFIDFscores.get(term);
                    // NOTE(review): preserved from the original — keeps the LOWEST
                    // score seen for a word across documents; confirm this matches
                    // the ordering expected by DataManipulation.sortHashmap.
                    if (previous == null || previous.doubleValue() > tfidfvalue) {
                        wordTFIDFscores.put(term, tfidfvalue);
                    }
                }
            }
            DataManipulation shmap = new DataManipulation();
            List<String> sortedWords = shmap.sortHashmap(wordTFIDFscores);
            // Clamp the cut so fewer distinct words than topWords no longer
            // throws IndexOutOfBoundsException.
            topWordsList = sortedWords.subList(0, Math.min(topWords, sortedWords.size()));
            File file_words = new File(directory + "words.txt");
            FileUtils.writeLines(file_words, topWordsList);
            return topWordsList;
        } catch (IOException ex) {
            Logger.getLogger(TFIDF.class.getName()).log(Level.SEVERE, null, ex);
            return topWordsList;
        }
    }

    /**
     * Tokenizes a document with the same cleaning rules as
     * {@link #tfCalculator(String, String)}: strips punctuation, splits on
     * non-word runs, and drops empty tokens.
     *
     * @param doc the document text; may be {@code null} or empty
     * @return the list of tokens (empty for null/blank input, never {@code null})
     */
    private List<String> tokenize(String doc) {
        List<String> tokens = new ArrayList<>();
        if (doc == null || doc.equalsIgnoreCase("")) {
            return tokens;
        }
        String cleaned = doc.replaceAll("[\\W&&[^\\s]]", "");
        if (cleaned.equalsIgnoreCase("")) {
            return tokens;
        }
        for (String token : cleaned.split("\\W+")) {
            if (token != null && !token.equalsIgnoreCase("")) {
                tokens.add(token);
            }
        }
        return tokens;
    }
}