package es.upm.fi.dia.oeg.map4rdf.server.servlet;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.google.inject.Singleton;
@Singleton
public class ParseWikipediaService extends HttpServlet{
private static final long serialVersionUID = -8524195705285261839L;
private static final String WIKIPEDIA_PARAM="URL";
@SuppressWarnings("static-access")
@Override
protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
try {
resp.setContentType("text/html; charset=UTF-8");
String URL = getWikipediaURL(req);
String result="";
if(URL.isEmpty() || !URL.contains("wikipedia")){
result="You need to specified URL parameter with wikipedia URL.";
printString(result, resp);
resp.getOutputStream().close();
return;
}
if(!URL.contains("http://")){
URL="https://"+URL;
}
if(URL.contains("http://")){
URL=URL.replace("http://", "https://");
}
try {
final URL wikipediaURL = new URL(URL);
final String host=wikipediaURL.getHost();
final HttpURLConnection wikipediaCon = (HttpURLConnection)wikipediaURL.openConnection();
wikipediaCon.setFollowRedirects(true);
wikipediaCon.addRequestProperty("Content-Type", "text/plain; charset=utf-8");
wikipediaCon.setRequestProperty("Content-Type", "text/plain; charset=utf-8");
wikipediaCon.setRequestProperty("content-type", "text/plain; charset=utf-8");
wikipediaCon.connect();
BufferedReader buffReader = new BufferedReader(
new InputStreamReader(wikipediaCon.getInputStream(),"UTF-8"));
String toReturn=htmlParseWikipediaInfobox(buffReader,host);
if(toReturn==null){
final URLConnection wikipediaConDescription = wikipediaURL.openConnection();
final BufferedReader buffReaderDescription = new BufferedReader(
new InputStreamReader(wikipediaConDescription.getInputStream(),"UTF-8"));
toReturn=htmlParseWikipediaFirtsDescription(buffReaderDescription,host);
}
printString(toReturn, resp);
resp.getOutputStream().close();
return;
} catch (final MalformedURLException e) {
e.printStackTrace();
throw new ServletException(e);
} catch (final IOException e) {
e.printStackTrace();
throw new ServletException(e);
}
} catch (Exception e) {
throw new ServletException(e);
}
}
private String getWikipediaURL(HttpServletRequest req){
String wikipediaURL=req.getParameterValues(WIKIPEDIA_PARAM)[0];
return wikipediaURL;
}
private void printString(String toPrint,HttpServletResponse resp) throws IOException{
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(resp.getOutputStream(),"UTF-8"));
writer.append(toPrint);
writer.flush();
}
private String htmlParseWikipediaFirtsDescription(
BufferedReader buffReader, String host) {
String result="";
boolean finalHead=false;
boolean foundFirtsDescription=false;
boolean foundFirtsP=false;
boolean finish=false;
try {
String inputLine = buffReader.readLine();
while (inputLine != null && !finish) {
//System.out.println(inputLine);
if(inputLine.contains("</head")){
finalHead = true;
result+=inputLine;
result+="<body>";
}
if(finalHead && inputLine.contains("mw-content-text")){
foundFirtsDescription=true;
}
if(foundFirtsDescription && inputLine.contains("<p")){
foundFirtsP=true;
}
if(!finalHead || foundFirtsP){
if(inputLine.contains("href=/")){
inputLine=inputLine.replace("href=/", "target=\"_blank\" href=\"http://"+host+"/");
}
if(inputLine.contains("href=\"/")){
inputLine=inputLine.replace("href=\"/", "target=\"_blank\" href=\"http://"+host+"/");
}
result+=inputLine;
}
if(inputLine.contains("id=\"toc\"")){
finish=true;
}
inputLine = buffReader.readLine();
}
if(!foundFirtsP){
return "Not found infobox or description";
}
result+="</body></html>";
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
buffReader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return result;
}
private String htmlParseWikipediaInfobox(BufferedReader buffReader, String host){
String result="";
boolean finalHead=false;
boolean foundInfobox=false;
boolean finish=false;
int countTables=-1;
try {
boolean firtsExecution = true;
String inputLine = buffReader.readLine();
if(firtsExecution){
while(!buffReader.ready()){}
inputLine = buffReader.readLine();
firtsExecution = false;
}
String inputLineContains = "";
while (inputLine != null && !finish) {
inputLineContains = "";
if(inputLine !=null ){
inputLineContains = inputLine.toLowerCase();
}
if(inputLineContains.contains("</head")){
finalHead = true;
result+=inputLine;
result+="<body>";
}
if(finalHead && (inputLineContains.contains("infobox_v2") || inputLineContains.contains("infobox"))){
foundInfobox=true;
}
if(!finalHead || foundInfobox){
if((inputLineContains.contains("infobox_v2") || inputLineContains.contains("infobox"))){
result+="<table style=\"width:15px; text-align:left;\">";
}else{
if(finalHead){
if(inputLine.contains("href=/")){
inputLine=inputLine.replace("href=/", "target=\"_blank\" href=\"http://"+host+"/");
}
if(inputLine.contains("href=\"/")){
inputLine=inputLine.replace("href=\"/", "target=\"_blank\" href=\"http://"+host+"/");
}
}
result+=inputLine;
}
if(foundInfobox && inputLine.contains("<table")){
countTables++;
}
if(inputLineContains.contains("</table")){
if(countTables==0){
finish=true;
}else{
countTables--;
}
}
}
inputLine = buffReader.readLine();
}
if(!foundInfobox){
return null;
}
result+="</body></html>";
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
buffReader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return result;
}
}