/*
* Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.thesmartweb.swebrank;
import java.io.File;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
/**
 * Client for the Sensebot concept-extraction web service.
 * <p>
 * It builds the request URL for each link, reads the XML response, splits the
 * returned text into individual terms and stores the collected terms on disk.
 * @author Administrator
 */
public class Sensebot {

    /** Logger shared by every method of this class. */
    private static final Logger LOGGER = Logger.getLogger(Sensebot.class.getName());

    /** Endpoint of the Sensebot concept-extraction operation. */
    private static final String SERVICE_URL = "http://api.sensebot.net/svc/extconcone.asmx/ExtractConcepts";

    /** Encoding used when percent-encoding query parameter values. */
    private static final String QUERY_ENCODING = "UTF-8";

    /**
     * Method that connects to the Sensebot url and gets the document using SAXReader.
     * <p>
     * Only the second content node of the root element is used; its string value is
     * passed through {@code DataManipulation.removeChars} and lower-cased.
     * @param link_ur the link to read from
     * @return the processed response text, or an empty string when the root element has
     *         fewer than two content nodes or the document cannot be read
     */
    public String connect(URL link_ur) {
        try {
            // NOTE(review): the reader parses a remote response with default parser settings;
            // consider disabling DTDs / external entities (XXE) once it is confirmed that the
            // service never sends a DOCTYPE.
            SAXReader reader = new SAXReader();
            Document document = reader.read(link_ur);
            Element root = document.getRootElement();
            List<Node> content = root.content();
            if (content.size() < 2) {
                return "";
            }
            String stringValue = content.get(1).getStringValue();
            DataManipulation tp = new DataManipulation();
            return tp.removeChars(stringValue).toLowerCase();
        } catch (DocumentException ex) {
            LOGGER.log(Level.SEVERE, "Could not read the Sensebot response from " + link_ur, ex);
            return "";
        }
    }

    /**
     * Method to get the top sensebot concepts recognized for given links.
     * <p>
     * Every non-null link is sent to Sensebot, the response is split into terms and the
     * terms of all links are written, one per line, to {@code directory + "words.txt"}.
     * The username and each link are percent-encoded before being placed in the query
     * string, so links are expected to be passed in their plain (not pre-encoded) form.
     * @param links the links to search for; {@code null} entries are skipped
     * @param directory the directory to save the results to; it is prepended verbatim to
     *        the file name, so it has to end with a path separator
     * @param SensebotConcepts the amount of concepts to search for
     * @param config_path the path to find sensebot's username
     * @return a list with all the top sensebot concepts recognized for the given links;
     *         when an I/O problem occurs the terms collected up to that point are returned
     */
    public List<String> compute(String[] links, String directory, int SensebotConcepts, String config_path) {
        List<String> wordList = new ArrayList<>();
        try {
            String username = URLEncoder.encode(GetUserName(config_path), QUERY_ENCODING);
            String requestPrefix = SERVICE_URL
                    + "?userName=" + username
                    + "&numConcepts=" + SensebotConcepts
                    + "&artClass=&artLength=0&Lang=English&allURLs=";
            for (String link : links) {
                if (link == null) {
                    continue;
                }
                // Without encoding, any '&', '?', '#' or space inside the link would be
                // interpreted as part of the request's own query string.
                URL requestUrl = new URL(requestPrefix + URLEncoder.encode(link, QUERY_ENCODING));
                addTerms(connect(requestUrl), wordList);
            }
            File file_words = new File(directory + "words.txt");
            FileUtils.writeLines(file_words, wordList);
        } catch (IOException ex) {
            // MalformedURLException and UnsupportedEncodingException are IOExceptions too.
            LOGGER.log(Level.SEVERE, "Could not retrieve or store the Sensebot concepts", ex);
        }
        return wordList;
    }

    /**
     * Splits a Sensebot response into individual terms and appends them to the given list.
     * <p>
     * Characters that are neither word characters ({@code [a-zA-Z_0-9]}) nor whitespace are
     * removed first; the remainder is split on runs of non-word characters.
     * @param response the text returned by {@link #connect(URL)}; may be null or empty
     * @param wordList the list that receives the non-empty terms
     */
    private static void addTerms(String response, List<String> wordList) {
        if (response == null || response.isEmpty()) {
            return;
        }
        String cleaned = response.replaceAll("[\\W&&[^\\s]]", "");
        for (String term : cleaned.split("\\W+")) {
            // split yields an empty leading element when the text starts with whitespace
            if (!term.isEmpty()) {
                wordList.add(term);
            }
        }
    }

    /**
     * Method to get the userName of sensebot.
     * <p>
     * The files returned by {@code DataManipulation.getinputfiles} for the "txt" extension
     * are scanned for names containing "sensebotUsername"; the first line of the last such
     * file is the username.
     * @param config_path the path to find sensebot's username
     * @return Sensebot's username, or an empty string when no matching file with content exists
     */
    public String GetUserName(String config_path) {
        Path input_path = Paths.get(config_path);
        DataManipulation getfiles = new DataManipulation();
        // presumably collects the txt files found under the configuration path - verify in DataManipulation
        Collection<File> inputs_files = getfiles.getinputfiles(input_path.toString(), "txt");
        List<String> tokenList = new ArrayList<>();
        ReadInput ri = new ReadInput();
        for (File input : inputs_files) {
            if (input.getName().contains("sensebotUsername")) {
                tokenList = ri.readLinesConfig(input);
            }
        }
        if (tokenList == null || tokenList.isEmpty()) {
            return "";
        }
        return tokenList.get(0);
    }
}