Sensebot.java example

Explorer

SWebRank-master
- src
  - main
    - java
      - com
        seomoz
        api
        authentication
        Authenticator.java
        Base64.java
        constants
        AnchorTextConstants.java
        LinksConstants.java
        TopPagesConstants.java
        URLMetricsConstants.java
        example
        Sample.java
        response
        AnchorTextResponse.java
        LinksResponse.java
        TopPagesResponse.java
        UrlResponse.java
        service
        AnchorTextService.java
        LinksService.java
        TopPagesService.java
        URLMetricsService.java
        util
        ConnectionUtil.java
        thesmartweb
        swebrank
        APIconn.java
        AnnotationClient.java
        BingResults.java
        CheckConvergence.java
        CombinationGenerator.java
        Combinations_Engine.java
        DBpediaSpotlightClient.java
        DandelionEntities.java
        DataManipulation.java
        Diffbot.java
        ElasticGetWordList.java
        GoogleResults.java
        JSONparsing.java
        LDAcall.java
        LDAsemStats.java
        LDAtopicsWords.java
        Lemmatizer.java
        LinksParseAnalysis.java
        Main.java
        Moz.java
        NWD_Analysis.java
        NWD_total.java
        PermutationGenerator.java
        PorterStemmer.java
        ReadInput.java
        Search_analysis.java
        Sensebot.java
        Sindice.java
        StHttpRequest.java
        StemmerSnow.java
        Stopwords.java
        TFIDF.java
        Total_analysis.java
        TwitterAnalysis.java
        VisibilityScore.java
        WebParser.java
        YahooConn.java
        YahooEntityCategory.java
        YahooResults.java
      - org
        tartarus
        snowball
        Among.java
        SnowballProgram.java
        SnowballStemmer.java
        TestApp.java
        ext
        englishStemmer.java

/* 
 * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.thesmartweb.swebrank;

import java.io.File;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

/**
 * Class to deal with the various functionalities related to Sensebot
 * @author Administrator
 */
 
public class Sensebot {

    
    /**
     * Method that connects to the Sensebot url and gets the document using SAXReader
     * @param link_ur the link to read from
     * @return the response in a string
     */
    public String connect(URL link_ur) {
        try{
            SAXReader reader = new SAXReader();
            Document document = reader.read(link_ur);
            Element root = document.getRootElement();
            List<Node> content = root.content();
            String stringValue="";
            if (!(content.isEmpty())&&content.size()>1){
                Node get = content.get(1);
                stringValue = get.getStringValue();
                DataManipulation tp = new  DataManipulation();
                stringValue=tp.removeChars(stringValue).toLowerCase();
            }
            return stringValue;
        }catch (DocumentException ex) {
                Logger.getLogger(Sensebot.class.getName()).log(Level.SEVERE, null, ex);
                String output="";
                return output;
        } 
    
    }

    /**
     * Method to get the top sensebot concepts recognized for given links
     * @param links the links to search for
     * @param directory the directory to save the results to
     * @param SensebotConcepts the amount of concepts to search for
     * @param config_path the path to find sensebot's username
     * @return a list with all the top sensebot concepts recognized for the given links
     */
    public List<String> compute (String[] links,String directory,int SensebotConcepts, String config_path){
       List<String> wordList=new ArrayList<>();
       try{
           URL diff_url = null;
           String stringtosplit="";
           String username = GetUserName(config_path);
           for (String link : links) {
               if (!(link == null)) {
                   diff_url = new URL("http://api.sensebot.net/svc/extconcone.asmx/ExtractConcepts?userName="+username+"&numConcepts="+SensebotConcepts+"&artClass=&artLength=0&Lang=English&allURLs=" + link);
                   stringtosplit=connect(diff_url);
                   if(!(stringtosplit==null)&&(!(stringtosplit.equalsIgnoreCase("")))){       
                       stringtosplit=stringtosplit.replaceAll("[\\W&&[^\\s]]", "");
                       if(!(stringtosplit==null)&&(!(stringtosplit.equalsIgnoreCase("")))){
                           String[] tokenizedTerms=stringtosplit.split("\\W+");    //to get individual terms
                           for (String tokenizedTerm : tokenizedTerms) {
                               if (!(tokenizedTerm == null) && (!(tokenizedTerm.equalsIgnoreCase("")))) {
                                   wordList.add(tokenizedTerm);
                               }    
                           }
                       }
                   }
               }
           }
           File file_words = new File(directory + "words.txt");
           FileUtils.writeLines(file_words,wordList);
           return wordList;
       }
       catch (MalformedURLException ex) {
           Logger.getLogger(Diffbot.class.getName()).log(Level.SEVERE, null, ex);
           return wordList;
       } catch (IOException ex) {
           Logger.getLogger(Diffbot.class.getName()).log(Level.SEVERE, null, ex);
           return wordList;
       }
    }
    /**
     * Method to get the userName of sensebot
     * @param config_path the path to find sensebot's username
     * @return Sensebot's username
     */
    public String GetUserName(String config_path){
        Path input_path=Paths.get(config_path);       
        DataManipulation getfiles=new DataManipulation();//class responsible for the extraction of paths
        Collection<File> inputs_files;//array to include the paths of the txt files
        inputs_files=getfiles.getinputfiles(input_path.toString(),"txt");//method to retrieve all the path of the input documents
        List<String> tokenList = new ArrayList<>();
        ReadInput ri = new ReadInput();
        for (File input : inputs_files) {
            if(input.getName().contains("sensebotUsername")){
                tokenList=ri.readLinesConfig(input);
            }
        }
        if(tokenList.size()>0){
            return tokenList.get(0);
        }
        else{
            String output="";
            return output;
        }
    }
}