ReadInput.java example

Explorer

SWebRank-master
- src
  - main
    - java
      - com
        seomoz
        api
        authentication
        Authenticator.java
        Base64.java
        constants
        AnchorTextConstants.java
        LinksConstants.java
        TopPagesConstants.java
        URLMetricsConstants.java
        example
        Sample.java
        response
        AnchorTextResponse.java
        LinksResponse.java
        TopPagesResponse.java
        UrlResponse.java
        service
        AnchorTextService.java
        LinksService.java
        TopPagesService.java
        URLMetricsService.java
        util
        ConnectionUtil.java
        thesmartweb
        swebrank
        APIconn.java
        AnnotationClient.java
        BingResults.java
        CheckConvergence.java
        CombinationGenerator.java
        Combinations_Engine.java
        DBpediaSpotlightClient.java
        DandelionEntities.java
        DataManipulation.java
        Diffbot.java
        ElasticGetWordList.java
        GoogleResults.java
        JSONparsing.java
        LDAcall.java
        LDAsemStats.java
        LDAtopicsWords.java
        Lemmatizer.java
        LinksParseAnalysis.java
        Main.java
        Moz.java
        NWD_Analysis.java
        NWD_total.java
        PermutationGenerator.java
        PorterStemmer.java
        ReadInput.java
        Search_analysis.java
        Sensebot.java
        Sindice.java
        StHttpRequest.java
        StemmerSnow.java
        Stopwords.java
        TFIDF.java
        Total_analysis.java
        TwitterAnalysis.java
        VisibilityScore.java
        WebParser.java
        YahooConn.java
        YahooEntityCategory.java
        YahooResults.java
      - org
        tartarus
        snowball
        Among.java
        SnowballProgram.java
        SnowballStemmer.java
        TestApp.java
        ext
        englishStemmer.java

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

/**
 * 
 *
 * @author Themis Mavridis
 */
import java.io.*;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.*;
import java.util.List;
import java.util.Scanner;

/**
 * Reads SWebRank's input files: the settings file that drives a run
 * ({@link #perform(File)}) and the plain-text files that hold API credentials
 * ({@link #GetKeyFile(String, String)}, {@link #readLinesConfig(File)}).
 * <p>
 * The settings file is line oriented. Every line has the form
 * {@code label: value}; only the text between the first and the second colon
 * is used as the value, and the order of the lines is significant.
 *
 * @author themis
 */
public class ReadInput {

    private static final Logger LOGGER = Logger.getLogger(ReadInput.class.getName());

    /** Number of search engine flags in the settings file (Bing, Google, Yahoo, merged results). */
    private static final int ENGINE_FLAG_COUNT = 4;

    /** Number of Moz flags in the settings file (the "use Moz" switch plus six metrics). */
    private static final int MOZ_FLAG_COUNT = 7;

    /** Number of content semantics flags in the settings file. */
    private static final int CONTENT_SEMANTICS_FLAG_COUNT = 4;

    /** Number of numeric SWebRank settings in the settings file. */
    private static final int SWEBRANK_SETTING_COUNT = 14;

    /** Position of the LDA flag inside {@link #ContentSemantics}. */
    private static final int LDA_FLAG_INDEX = 1;

    /** Position of the Sensebot flag inside {@link #ContentSemantics}. */
    private static final int SENSEBOT_FLAG_INDEX = 2;

    /**
     * the number of results that are returned from each search engine
     */
    protected int results_number;

    /**
     * moz threshold; set to -1.0 by {@link #perform(File)} when the threshold option is disabled
     */
    protected Double moz_threshold;

    /**
     * if we want to have a threshold in moz or not
     */
    protected boolean moz_threshold_option;

    /**
     * amount of Sensebot concepts (0 unless Sensebot is selected)
     */
    protected int SensebotConcepts;

    /**
     * List of the Moz flags
     * #0 true if we would like to use Moz at all
     * #1 Domain Authority
     * #2 External MozRank
     * #3 MozRank
     * #4 MozTrust
     * #5 Page Authority
     * #6 Subdomain MozRank
     * After {@link #perform(File)} at most one of #1-#6 is true, and #0 is
     * false when none of them is.
     */
    protected List<Boolean> mozMetrics;

    /**
     * List that contains the queries that are defined in the input files
     */
    protected List<String> queries;

    /**
     * Search engine choice
     * #0 Bing
     * #1 Google
     * #2 Yahoo
     * #3 merged engine results
     */
    protected List<Boolean> enginechoice;

    /**
     * List that contains which Semantic Analysis algorithm we choose
     * #0 Diffbot
     * #1 LDA
     * #2 Sensebot
     * #3 TF-IDF
     */
    protected List<Boolean> ContentSemantics;

    /**
     * List that contains the SWebRank settings
     * #0 beta
     * #1 number of topics
     * #2 number of iterations
     * #3 number of top words
     * #4 probability threshold
     * #5 nmi convergence limit
     * #6 nwd threshold
     * #7 combine limit
     * #8 performance limit
     * #9 new terms to combine from wordlist per query per round
     * #10 max new queries to generate per previous round query
     * #11 amount of top topics to choose
     * #12 dbpedia spotlight confidence
     * #13 dbpedia spotlight support
     */
    protected List<Double> SWebRankSettings;

    /**
     * Domain of queries
     */
    protected String domain;

    /**
     * Initialize the values
     */
    public ReadInput() {
        this.SensebotConcepts = 0;
        this.moz_threshold_option = false;
        this.moz_threshold = 0.0;
        this.results_number = 0;
        this.queries = new LinkedList<String>();//better in add(E element)
        this.enginechoice = new ArrayList<Boolean>();//lower complexity in get
        this.ContentSemantics = new ArrayList<Boolean>();
        this.SWebRankSettings = new ArrayList<Double>();
        this.mozMetrics = new ArrayList<Boolean>();
        this.domain = "";
    }

    /**
     * Method to read SWebRank's settings.
     * <p>
     * The values are stored in the fields of this instance, in the order in
     * which they appear in the file. Lines that are missing at the very end of
     * the file are tolerated: the corresponding settings simply keep their
     * current value or are not added to their list.
     * <p>
     * When the file is malformed (a line without a colon, a value that is not
     * a number, fewer query lines than announced, flags missing for the
     * consistency checks) reading stops at the offending line, the problem is
     * logged and {@code false} is returned; whatever was read up to that point
     * remains in the fields.
     *
     * @param Input The file that contains the input settings to be read
     * @return true if the file was opened and parsed without any error,
     *         false if it could not be opened, an I/O error occurred or its
     *         content was malformed
     */
    public boolean perform(File Input) {
        try (FileInputStream inputStream = new FileInputStream(Input);
                Scanner sc = new Scanner(inputStream)) {
            if (sc.hasNextLine()) {
                domain = extractValue(sc.nextLine());
            }
            if (sc.hasNextLine()) {
                int queriesNumber = Integer.parseInt(extractValue(sc.nextLine()));
                // the announced amount of query lines is mandatory: a missing
                // line raises NoSuchElementException, which is reported below
                for (int j = 0; j < queriesNumber; j++) {
                    queries.add(extractValue(sc.nextLine()));
                }
            }
            if (sc.hasNextLine()) {
                results_number = Integer.parseInt(extractValue(sc.nextLine()));
            }
            //-------------------
            // Bing, Google, Yahoo, merged engine results
            readBooleans(sc, enginechoice, ENGINE_FLAG_COUNT);
            //--------------------
            // use Moz, Domain Authority, External MozRank, MozRank, MozTrust,
            // Page Authority, Subdomain MozRank
            readBooleans(sc, mozMetrics, MOZ_FLAG_COUNT);
            if (sc.hasNextLine()) {
                moz_threshold_option = Boolean.parseBoolean(extractValue(sc.nextLine()));
            }
            //------if we are going to use the threshold then we insert the value to the moz_threshold variable
            //------otherwise, we just skip it
            if (sc.hasNextLine()) {
                String thresholdLine = sc.nextLine();
                if (moz_threshold_option) {
                    moz_threshold = Double.parseDouble(extractValue(thresholdLine));
                } else {
                    moz_threshold = -1.0;
                }
            }
            //-----the following is used to check if we have one and only one Moz option active
            //-----if the user has not set any metric to true, we are not going to use Moz
            if (mozMetrics.get(0) && !keepOnlyFirstEnabled(mozMetrics)) {
                mozMetrics.set(0, false);
            }
            //-----------------------------
            // Diffbot, LDA, Sensebot, TF-IDF
            readBooleans(sc, ContentSemantics, CONTENT_SEMANTICS_FLAG_COUNT);
            if (ContentSemantics.get(SENSEBOT_FLAG_INDEX)) {
                if (sc.hasNextLine()) {
                    SensebotConcepts = Integer.parseInt(extractValue(sc.nextLine()));
                }
            } else {
                // the line is present in the file even when Sensebot is not used, so skip it
                if (sc.hasNextLine()) {
                    sc.nextLine();
                }
                SensebotConcepts = 0;
            }
            //-----------------------------------
            //-----the following is used to check if we have one and only one Content Semantic option active
            //-----if none of LDA, Sensebot or TF-IDF is set to true, we fall back to LDA
            if (!keepOnlyFirstEnabled(ContentSemantics)) {
                ContentSemantics.set(LDA_FLAG_INDEX, true);
            }
            //-----------------------------
            readDoubles(sc, SWebRankSettings, SWEBRANK_SETTING_COUNT);
            // Scanner swallows I/O errors of the underlying stream, so ask for them explicitly
            return sc.ioException() == null;
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
            return false;
        } catch (IndexOutOfBoundsException | NumberFormatException | NoSuchElementException ex) {
            LOGGER.log(Level.SEVERE, "Malformed settings file: " + Input, ex);
            return false;
        }
    }

    /**
     * Method to get the api credentials in a list from a directory that contains multiple txt files.
     * If more than one file name contains {@code name}, the lines of the last
     * matching file are returned.
     *
     * @param config_path the directory to read
     * @param name the name of the file that we would like to read
     * @return a list with the credentials, empty if no file name contains {@code name}
     */
    public List<String> GetKeyFile(String config_path, String name) {
        Path input_path = Paths.get(config_path);
        DataManipulation getfiles = new DataManipulation();//class responsible for the extraction of paths
        //collection that includes the txt files found for the given path
        Collection<File> inputs_files = getfiles.getinputfiles(input_path.toString(), "txt");
        List<String> linesList = new ArrayList<>();
        for (File input : inputs_files) {
            if (input.getName().contains(name)) {
                linesList = readLinesConfig(input);
            }
        }
        return linesList;
    }

    /**
     * Get the API credentials from a given file.
     * Every line of the file becomes one entry of the result, with leading
     * and trailing whitespace removed.
     *
     * @param Input the file to read
     * @return a List with the credentials in strings; if the file cannot be
     *         opened the problem is logged and the list is empty
     */
    public List<String> readLinesConfig(File Input) {
        List<String> output = new ArrayList<>();
        try (FileInputStream inputStream = new FileInputStream(Input);
                Scanner sc = new Scanner(inputStream)) {
            while (sc.hasNextLine()) {
                output.add(sc.nextLine().trim());
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
        return output;
    }

    /**
     * Extracts the value of a {@code label: value} line.
     *
     * @param line a line of the settings file
     * @return the trimmed text between the first and the second colon
     * @throws ArrayIndexOutOfBoundsException if the line has no text after a colon
     */
    private static String extractValue(String line) {
        return line.split(":")[1].trim();
    }

    /**
     * Reads up to {@code count} boolean settings, one per line, and appends them to {@code target}.
     * Stops silently when the input has no more lines.
     */
    private static void readBooleans(Scanner sc, List<Boolean> target, int count) {
        for (int i = 0; i < count && sc.hasNextLine(); i++) {
            target.add(Boolean.parseBoolean(extractValue(sc.nextLine())));
        }
    }

    /**
     * Reads up to {@code count} numeric settings, one per line, and appends them to {@code target}.
     * Stops silently when the input has no more lines.
     */
    private static void readDoubles(Scanner sc, List<Double> target, int count) {
        for (int i = 0; i < count && sc.hasNextLine(); i++) {
            target.add(Double.parseDouble(extractValue(sc.nextLine())));
        }
    }

    /**
     * Makes sure that at most one of the flags from index 1 onwards is set:
     * the first flag that is true is kept and every later one is set to false.
     * The flag at index 0 is neither inspected nor changed.
     *
     * @param flags the flags to normalize
     * @return true if a flag at index 1 or later was set, false otherwise
     */
    private static boolean keepOnlyFirstEnabled(List<Boolean> flags) {
        int firstEnabled = -1;
        for (int k = 1; k < flags.size() && firstEnabled < 0; k++) {
            if (flags.get(k)) {
                firstEnabled = k;
            }
        }
        if (firstEnabled < 0) {
            return false;
        }
        for (int k = firstEnabled + 1; k < flags.size(); k++) {
            flags.set(k, false);
        }
        return true;
    }

}