/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Class to get stats related to the content generated by LDA
* @author Themistoklis Mavridis
*/
public class LDAsemStats {

    /** Number of matches between LDA's top words and the words of the parsed content of a url. */
    private int lda_top_words_parsed = 0;

    /** Number of matches between LDA's top words and the words of the supplied entities. */
    private int ent_cnt = 0;

    /** Number of matches between LDA's top words and the words of the supplied categories. */
    private int cat_cnt = 0;

    /** {@link #lda_top_words_parsed} divided by the number of words of the parsed content. */
    private double lda_top_words_parsed_percentage = 0.0;

    /**
     * Method to get various stats regarding the existence of semantic entities and categories in lda's output.
     * Every entity and category is split into words on single spaces and every (case-insensitive) match
     * between a word of lda's output and one of those words is counted. The results are available through
     * {@link #getEntStats()} and {@link #getCategoryStats()}.
     * Both counters are reset on every call and stay 0 if any of the lists is null.
     * @param Entities list of entities
     * @param Categories list of categories
     * @param lda_output lda's output
     * @param StemFlag flag to use stemming
     */
    public void getEntCatStats(List<String> Entities, List<String> Categories, List<String> lda_output, boolean StemFlag) {
        ent_cnt = 0;
        cat_cnt = 0;
        if (Entities == null || Categories == null || lda_output == null) {
            return;
        }
        List<String> topWords = lda_output;
        List<String> entityWords = splitOnSpaces(Entities);
        List<String> categoryWords = splitOnSpaces(Categories);
        if (StemFlag) {
            // the same stemmer instance is used for all lists, in the same order as before
            StemmerSnow stemmer = new StemmerSnow();
            topWords = stemmer.stem(topWords);
            entityWords = stemmer.stem(entityWords);
            categoryWords = stemmer.stem(categoryWords);
        }
        ent_cnt = countMatches(topWords, entityWords);
        cat_cnt = countMatches(topWords, categoryWords);
    }

    /**
     * Method to get stats comparing lda's output and the parsed content of a url.
     * The content is split into words on single spaces and every (case-insensitive) match between a word
     * of lda's output and a word of the content is counted. The results are available through
     * {@link #getTopStats()} and {@link #getTopPercentageStats()}.
     * Both values are reset on every call and stay 0 if the content is null or empty or if lda's output is null.
     * @param parsedContent the parsed content of a specific web document
     * @param lda_output lda's output
     * @param StemFlag flag to use stemming or not
     */
    public void getTopWordsStats(String parsedContent, List<String> lda_output, boolean StemFlag) {
        lda_top_words_parsed = 0;
        lda_top_words_parsed_percentage = 0.0;
        if (parsedContent == null || parsedContent.isEmpty() || lda_output == null) {
            return;
        }
        List<String> topWords = lda_output;
        List<String> contentWords = Arrays.asList(parsedContent.split(" "));
        if (StemFlag) {
            // the content is stemmed before lda's output, as before
            StemmerSnow stemmer = new StemmerSnow();
            contentWords = stemmer.stem(contentWords);
            topWords = stemmer.stem(topWords);
        }
        lda_top_words_parsed = countMatches(topWords, contentWords);
        // split(" ") drops trailing empty strings, so content made only of spaces yields no words at all;
        // dividing by that zero length would store NaN instead of a percentage
        if (!contentWords.isEmpty()) {
            lda_top_words_parsed_percentage = lda_top_words_parsed / (double) contentWords.size();
        }
    }

    /**
     * Splits every phrase of the list on single spaces and collects all the resulting words, in order.
     * Null phrases are skipped.
     * @param phrases the phrases to split
     * @return a new list with the words of all the phrases
     */
    private static List<String> splitOnSpaces(List<String> phrases) {
        List<String> words = new ArrayList<>();
        for (String phrase : phrases) {
            if (phrase != null) {
                words.addAll(Arrays.asList(phrase.split(" ")));
            }
        }
        return words;
    }

    /**
     * Counts the pairs (word, candidate) that are equal ignoring case. Duplicates on either side are
     * counted once per pair. Null words are skipped.
     * @param words the words to look for
     * @param candidates the words to compare against
     * @return the number of matching pairs
     */
    private static int countMatches(List<String> words, List<String> candidates) {
        int matches = 0;
        for (String word : words) {
            if (word == null) {
                continue;
            }
            for (String candidate : candidates) {
                if (word.equalsIgnoreCase(candidate)) {
                    matches++;
                }
            }
        }
        return matches;
    }

    /**
     * Method to get the amount of top words that were included in the content parsed in a specific url
     * @return the amount of top words that were included in the content parsed in a specific url
     */
    public int getTopStats() {
        return lda_top_words_parsed;
    }

    /**
     * Method to return the percentage of top words of LDA against the total length of the parsed Content
     * @return the percentage of top words of LDA against the total length of the parsed Content
     */
    public double getTopPercentageStats() {
        return lda_top_words_parsed_percentage;
    }

    /**
     * Method to return the amount of entities that were included in the top words generated by LDA
     * @return the amount of entities that were included in the top words generated by LDA
     */
    public int getEntStats() {
        return ent_cnt;
    }

    /**
     * Method to return the amount of categories that were included in the top words generated by LDA
     * @return the amount of categories that were included in the top words generated by LDA
     */
    public int getCategoryStats() {
        return cat_cnt;
    }
}