/*
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Class to get stats related to the content generated by LDA.
 * <p>
 * The two computing methods ({@link #getEntCatStats} and {@link #getTopWordsStats})
 * store their results in this instance; the results are read back through the getters.
 * Every computing call first resets the values it is responsible for.
 *
 * @author Themistoklis Mavridis
 */
public class LDAsemStats {

    /** The amount of top words that were included in the content parsed in a specific url. */
    private int lda_top_words_parsed = 0;

    /** The amount of entities that were included in the top words generated by LDA. */
    private int ent_cnt = 0;

    /** The amount of categories that were included in the top words generated by LDA. */
    private int cat_cnt = 0;

    /**
     * The number of top word matches divided by the number of words of the parsed content.
     * It is a plain ratio (it is not multiplied by 100).
     */
    private double lda_top_words_parsed_percentage = 0.0;

    /**
     * Method to get various stats regarding the existence of semantic entities, categories in lda's output.
     * <p>
     * Every entity and every category is split on single spaces into words. Each
     * (lda word, entity word) pair that is equal ignoring case adds one to the entity
     * count, and likewise for categories, so repeated words are counted once per pair.
     * <p>
     * Both counts are reset to 0 first and stay 0 when {@code Entities},
     * {@code Categories} or {@code lda_output} is {@code null}.
     *
     * @param Entities list of entities
     * @param Categories list of categories
     * @param lda_output lda's output
     * @param StemFlag flag to use stemming; when set, lda's output and the split words are
     *                 passed through {@code StemmerSnow} before they are compared
     */
    public void getEntCatStats(List<String> Entities, List<String> Categories, List<String> lda_output, boolean StemFlag) {
        ent_cnt = 0;
        cat_cnt = 0;
        // Counting needs both term lists (as before) and something to compare them against.
        if (Entities == null || Categories == null || lda_output == null) {
            return;
        }
        List<String> entityWords = splitOnSpaces(Entities);
        List<String> categoryWords = splitOnSpaces(Categories);
        if (StemFlag) {
            // The stemmer calls are kept in their original order: lda output, entities, categories.
            StemmerSnow stemmer = new StemmerSnow();
            lda_output = stemmer.stem(lda_output);
            entityWords = stemmer.stem(entityWords);
            categoryWords = stemmer.stem(categoryWords);
        }
        ent_cnt = countMatches(lda_output, entityWords);
        cat_cnt = countMatches(lda_output, categoryWords);
    }

    /**
     * Method to get stats comparing lda's output and the parsed content of a url.
     * <p>
     * The parsed content is split on single spaces. Each (lda word, content word) pair that
     * is equal ignoring case adds one to the top words count. The ratio is that count
     * divided by the number of content words, so it can exceed 1 when words repeat.
     * <p>
     * Both values are reset to 0 first and stay 0 when {@code parsedContent} is
     * {@code null} or empty, when {@code lda_output} is {@code null}, or when the content
     * contains no words at all (e.g. it consists only of spaces).
     *
     * @param parsedContent the parsed content of a specific web document
     * @param lda_output lda's output
     * @param StemFlag flag to use stemming or not; when set, the content words and lda's
     *                 output are passed through {@code StemmerSnow} before they are compared
     */
    public void getTopWordsStats(String parsedContent, List<String> lda_output, boolean StemFlag) {
        lda_top_words_parsed = 0;
        lda_top_words_parsed_percentage = 0.0;
        if (parsedContent == null || parsedContent.isEmpty() || lda_output == null) {
            return;
        }
        // Fixed-size view over the split array, exactly what the stemmer received before.
        List<String> contentWords = Arrays.asList(parsedContent.split(" "));
        if (StemFlag) {
            // The stemmer calls are kept in their original order: content first, then lda output.
            StemmerSnow stemmer = new StemmerSnow();
            contentWords = stemmer.stem(contentWords);
            lda_output = stemmer.stem(lda_output);
        }
        // split(" ") drops trailing empty strings, so content made only of spaces yields no
        // words; dividing by that zero length would store NaN instead of a usable ratio.
        if (contentWords == null || contentWords.isEmpty()) {
            return;
        }
        lda_top_words_parsed = countMatches(lda_output, contentWords);
        lda_top_words_parsed_percentage = lda_top_words_parsed / (double) contentWords.size();
    }

    /**
     * Method to get the amount of top words that were included in the content parsed in a specific url
     * @return the amount of top words that were included in the content parsed in a specific url
     */
    public int getTopStats() {
        return lda_top_words_parsed;
    }

    /**
     * Method to return the ratio of top word matches of LDA against the total length (in words) of the parsed Content
     * @return the number of top word matches divided by the number of words of the parsed Content
     */
    public double getTopPercentageStats() {
        return lda_top_words_parsed_percentage;
    }

    /**
     * Method to return the amount of entities that were included in the top words generated by LDA
     * @return the amount of entities that were included in the top words generated by LDA
     */
    public int getEntStats() {
        return ent_cnt;
    }

    /**
     * Method to return the amount of categories that were included in the top words generated by LDA
     * @return the amount of categories that were included in the top words generated by LDA
     */
    public int getCategoryStats() {
        return cat_cnt;
    }

    /**
     * Splits every phrase on single spaces and gathers all the pieces, in order, in one list.
     * {@code null} phrases are skipped.
     *
     * @param phrases the phrases to split, not {@code null}
     * @return a new modifiable list with the words of all phrases
     */
    private static List<String> splitOnSpaces(List<String> phrases) {
        List<String> words = new ArrayList<>();
        for (String phrase : phrases) {
            if (phrase != null) {
                words.addAll(Arrays.asList(phrase.split(" ")));
            }
        }
        return words;
    }

    /**
     * Counts the (word, candidate) pairs that are equal ignoring case.
     * A {@code null} list counts as empty and {@code null} words are skipped.
     *
     * @param words the words to look for (lda's output)
     * @param candidates the words to compare against
     * @return the number of matching pairs
     */
    private static int countMatches(List<String> words, List<String> candidates) {
        if (words == null || candidates == null) {
            return 0;
        }
        int matches = 0;
        for (String word : words) {
            if (word == null) {
                continue;
            }
            for (String candidate : candidates) {
                if (word.equalsIgnoreCase(candidate)) {
                    matches++;
                }
            }
        }
        return matches;
    }
}