/* * Copyright 2015 Themistoklis Mavridis <themis.mavridis@issel.ee.auth.gr>. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.thesmartweb.swebrank; import java.io.File; import java.io.IOException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collection; import java.util.logging.Level; import java.util.logging.Logger; import java.util.List; import org.apache.commons.io.FileUtils; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.Node; import org.dom4j.io.SAXReader; /** * Class to deal with the various functionalities related to Sensebot * @author Administrator */ public class Sensebot { /** * Method that connects to the Sensebot url and gets the document using SAXReader * @param link_ur the link to read from * @return the response in a string */ public String connect(URL link_ur) { try{ SAXReader reader = new SAXReader(); Document document = reader.read(link_ur); Element root = document.getRootElement(); List<Node> content = root.content(); String stringValue=""; if (!(content.isEmpty())&&content.size()>1){ Node get = content.get(1); stringValue = get.getStringValue(); DataManipulation tp = new DataManipulation(); stringValue=tp.removeChars(stringValue).toLowerCase(); } return stringValue; }catch (DocumentException ex) { Logger.getLogger(Sensebot.class.getName()).log(Level.SEVERE, null, ex); String output=""; return output; } } /** * Method to get the top sensebot concepts recognized for given links * @param links the links to search for * @param directory the directory to save the results to * @param SensebotConcepts the amount of concepts to search for * @param config_path the path to find sensebot's username * @return a list with all the top sensebot concepts recognized for the given links */ public List<String> compute (String[] links,String directory,int SensebotConcepts, String config_path){ List<String> wordList=new ArrayList<>(); try{ URL diff_url = null; String stringtosplit=""; String username = GetUserName(config_path); for (String link : links) { if (!(link == null)) { diff_url = new URL("http://api.sensebot.net/svc/extconcone.asmx/ExtractConcepts?userName="+username+"&numConcepts="+SensebotConcepts+"&artClass=&artLength=0&Lang=English&allURLs=" + link); stringtosplit=connect(diff_url); if(!(stringtosplit==null)&&(!(stringtosplit.equalsIgnoreCase("")))){ stringtosplit=stringtosplit.replaceAll("[\\W&&[^\\s]]", ""); if(!(stringtosplit==null)&&(!(stringtosplit.equalsIgnoreCase("")))){ String[] tokenizedTerms=stringtosplit.split("\\W+"); //to get individual terms for (String tokenizedTerm : tokenizedTerms) { if (!(tokenizedTerm == null) && (!(tokenizedTerm.equalsIgnoreCase("")))) { wordList.add(tokenizedTerm); } } } } } } File file_words = new File(directory + "words.txt"); FileUtils.writeLines(file_words,wordList); return wordList; } catch (MalformedURLException ex) { Logger.getLogger(Diffbot.class.getName()).log(Level.SEVERE, null, ex); return wordList; } catch (IOException ex) { Logger.getLogger(Diffbot.class.getName()).log(Level.SEVERE, null, ex); return wordList; } } /** * Method to get the userName of sensebot * @param config_path the path to find sensebot's username * @return Sensebot's username */ public String GetUserName(String config_path){ Path input_path=Paths.get(config_path); DataManipulation getfiles=new DataManipulation();//class responsible for the extraction of paths Collection<File> inputs_files;//array to include the paths of the txt files inputs_files=getfiles.getinputfiles(input_path.toString(),"txt");//method to retrieve all the path of the input documents List<String> tokenList = new ArrayList<>(); ReadInput ri = new ReadInput(); for (File input : inputs_files) { if(input.getName().contains("sensebotUsername")){ tokenList=ri.readLinesConfig(input); } } if(tokenList.size()>0){ return tokenList.get(0); } else{ String output=""; return output; } } }