LDAsemStats.java example

Explorer

SWebRank-master
- src
  - main
    - java
      - com
        seomoz
        api
        authentication
        Authenticator.java
        Base64.java
        constants
        AnchorTextConstants.java
        LinksConstants.java
        TopPagesConstants.java
        URLMetricsConstants.java
        example
        Sample.java
        response
        AnchorTextResponse.java
        LinksResponse.java
        TopPagesResponse.java
        UrlResponse.java
        service
        AnchorTextService.java
        LinksService.java
        TopPagesService.java
        URLMetricsService.java
        util
        ConnectionUtil.java
        thesmartweb
        swebrank
        APIconn.java
        AnnotationClient.java
        BingResults.java
        CheckConvergence.java
        CombinationGenerator.java
        Combinations_Engine.java
        DBpediaSpotlightClient.java
        DandelionEntities.java
        DataManipulation.java
        Diffbot.java
        ElasticGetWordList.java
        GoogleResults.java
        JSONparsing.java
        LDAcall.java
        LDAsemStats.java
        LDAtopicsWords.java
        Lemmatizer.java
        LinksParseAnalysis.java
        Main.java
        Moz.java
        NWD_Analysis.java
        NWD_total.java
        PermutationGenerator.java
        PorterStemmer.java
        ReadInput.java
        Search_analysis.java
        Sensebot.java
        Sindice.java
        StHttpRequest.java
        StemmerSnow.java
        Stopwords.java
        TFIDF.java
        Total_analysis.java
        TwitterAnalysis.java
        VisibilityScore.java
        WebParser.java
        YahooConn.java
        YahooEntityCategory.java
        YahooResults.java
      - org
        tartarus
        snowball
        Among.java
        SnowballProgram.java
        SnowballStemmer.java
        TestApp.java
        ext
        englishStemmer.java

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Computes statistics about how the top words produced by LDA relate to
 * semantic entities/categories and to the parsed content of a web document.
 * <p>
 * The two compute methods ({@link #getEntCatStats} and {@link #getTopWordsStats})
 * store their results in this instance; the results are read back through the
 * getters. Each compute method resets the counters it owns before it runs, so
 * an instance can be reused. The class keeps mutable state and does no
 * synchronization of its own.
 * @author Themistoklis Mavridis
 */
public class LDAsemStats {
    private int lda_top_words_parsed=0;//number of (LDA word, content token) matches found in the parsed content of a specific url
    private int ent_cnt=0;//number of (LDA word, entity token) matches
    private int cat_cnt=0;//number of (LDA word, category token) matches
    private double lda_top_words_parsed_percentage=0.0;//lda_top_words_parsed divided by the number of tokens of the parsed content (a ratio, not multiplied by 100)

    /**
     * Method to count how many words of lda's output match the words of the semantic entities and categories.
     * <p>
     * Every entity and category is split on single spaces into tokens. Each lda word is then compared,
     * ignoring case, against every token; every matching pair increases the respective counter, so
     * repeated tokens are counted repeatedly. The results are available through {@link #getEntStats()}
     * and {@link #getCategoryStats()}.
     * <p>
     * If any of the three lists is null both counters are left at 0. Null elements inside the lists are skipped.
     * @param Entities list of entities
     * @param Categories list of categories
     * @param lda_output lda's output
     * @param StemFlag flag to use stemming (applied to lda's output and to the entity/category tokens)
     */
    public void getEntCatStats(List<String> Entities,List<String> Categories,List<String> lda_output, boolean StemFlag){
        ent_cnt=0;
        cat_cnt=0;
        if(Entities==null||Categories==null||lda_output==null){
            return;
        }
        List<String> entityWords = splitOnSpaces(Entities);
        List<String> categoryWords = splitOnSpaces(Categories);
        if(StemFlag){
            //both sides of the comparison have to be stemmed, otherwise stemmed and raw forms would never match
            StemmerSnow stemmer = new StemmerSnow();
            lda_output = stemmer.stem(lda_output);
            entityWords = stemmer.stem(entityWords);
            categoryWords = stemmer.stem(categoryWords);
        }
        ent_cnt = countMatches(lda_output, entityWords);
        cat_cnt = countMatches(lda_output, categoryWords);
    }

    /**
     * Method to get stats comparing lda's output and the parsed content of a url.
     * <p>
     * The content is split on single spaces. Each lda word is compared, ignoring case, against every
     * content token and every matching pair is counted. The count is available through
     * {@link #getTopStats()} and the count divided by the number of content tokens through
     * {@link #getTopPercentageStats()}.
     * <p>
     * If the content is null, empty or yields no tokens (e.g. it consists only of spaces), or if
     * lda's output is null, both values stay 0.
     * @param parsedContent the parsed content of a specific web document
     * @param lda_output lda's output
     * @param StemFlag flag to use stemming or not (applied to both the content tokens and lda's output)
     */
    public void getTopWordsStats(String parsedContent,List<String> lda_output, boolean StemFlag){
        lda_top_words_parsed=0;
        lda_top_words_parsed_percentage=0.0;
        if(parsedContent==null||parsedContent.isEmpty()||lda_output==null){
            return;
        }
        List<String> contentWords = Arrays.asList(parsedContent.split(" "));
        if(StemFlag){
            StemmerSnow stemmer = new StemmerSnow();
            contentWords = stemmer.stem(contentWords);
            lda_output = stemmer.stem(lda_output);
        }
        lda_top_words_parsed = countMatches(lda_output, contentWords);
        //String.split drops trailing empty strings, so space-only content has no tokens;
        //dividing by zero here would produce NaN instead of a usable ratio
        if(!contentWords.isEmpty()){
            lda_top_words_parsed_percentage = lda_top_words_parsed / (double) contentWords.size();
        }
    }

    /**
     * Splits every phrase on single spaces and collects all resulting tokens in one list.
     * @param phrases the phrases to split, null elements are skipped
     * @return a new mutable list with the tokens of all phrases in their original order
     */
    private static List<String> splitOnSpaces(List<String> phrases){
        List<String> words = new ArrayList<>();
        for(String phrase:phrases){
            if(phrase!=null){
                words.addAll(Arrays.asList(phrase.split(" ")));
            }
        }
        return words;
    }

    /**
     * Counts the pairs (word, candidate) that are equal ignoring case.
     * @param words the words to look for, null elements are skipped
     * @param candidates the tokens to compare against, null elements never match
     * @return the number of matching pairs
     */
    private static int countMatches(List<String> words, List<String> candidates){
        int matches=0;
        for(String word:words){
            if(word==null){
                continue;
            }
            for(String candidate:candidates){
                if(word.equalsIgnoreCase(candidate)){
                    matches++;
                }
            }
        }
        return matches;
    }

    /**
     * Method to get the amount of top words that were included in the content parsed in a specific url
     * @return the number of matches counted by the last call of {@link #getTopWordsStats}
     */
    public int getTopStats(){return lda_top_words_parsed;}
    /**
     * Method to return the ratio of matched top words of LDA against the total length (in tokens) of the parsed Content
     * @return the ratio computed by the last call of {@link #getTopWordsStats}, 0.0 if there was nothing to compare
     */
    public double getTopPercentageStats(){return lda_top_words_parsed_percentage;}
    /**
     * Method to return the amount of entities that were included in the top words generated by LDA
     * @return the number of entity matches counted by the last call of {@link #getEntCatStats}
     */
    public int getEntStats(){return ent_cnt;}
    /**
     * Method to return the amount of categories that were included in the top words generated by LDA
     * @return the number of category matches counted by the last call of {@link #getEntCatStats}
     */
    public int getCategoryStats(){return cat_cnt;}
}