package com.personalityextractor.url.HTMLParser.Readability;
import java.io.BufferedReader;
import java.io.InputStreamReader;
/**
 * Extracts plain text from a web page by delegating to an external Python
 * script and then stripping any markup left in the script's output.
 */
public class Readability {

    /**
     * Path of the Python script that is run for each URL. The default is an
     * absolute path on the original author's machine; pass a different path to
     * {@link #Readability(String)} when running elsewhere.
     */
    String pathToPythonScript = "/Users/tejaswi/Documents/workspace/PersonalityExtraction/src/com/personalityextractor/url/HTMLParser/Readability/ReadabilityHTMLParser.py";

    /** Creates an instance that uses the default script path. */
    public Readability() {
    }

    /**
     * Creates an instance that runs the given script.
     *
     * @param pathToPythonScript path of the Python script to execute
     */
    public Readability(String pathToPythonScript) {
        this.pathToPythonScript = pathToPythonScript;
    }

    /**
     * Replaces every {@code <...>} span in the input with a single space and
     * keeps all other characters unchanged.
     *
     * <p>A {@code '<'} with no matching {@code '>'} is treated as a tag that
     * runs to the end of the input, so the remainder is dropped instead of
     * reading past the end of the text.
     *
     * @param html text that may contain markup; may be {@code null}
     * @return the text with tags replaced by spaces, or {@code null} if
     *         {@code html} is {@code null}
     */
    public String removeTags(String html) {
        if (html == null) {
            return null;
        }
        int length = html.length();
        StringBuilder plain = new StringBuilder(length);
        int pos = 0;
        while (pos < length) {
            int open = html.indexOf('<', pos);
            if (open < 0) {
                // No more tags: copy the tail verbatim.
                plain.append(html, pos, length);
                break;
            }
            plain.append(html, pos, open);
            plain.append(' ');
            int close = html.indexOf('>', open + 1);
            if (close < 0) {
                // Unterminated tag: nothing after it can be trusted as text.
                break;
            }
            pos = close + 1;
        }
        return plain.toString();
    }

    /**
     * Runs the Python script on {@code url} and returns its standard output
     * with tags removed.
     *
     * <p>Standard error is drained on a separate thread while standard output
     * is being read, so a script that writes a lot to stderr cannot block on a
     * full pipe and stall this method.
     *
     * @param url argument passed to the script
     * @return the cleaned script output (each output line followed by
     *         {@code "\n"}), or {@code null} if the script wrote anything to
     *         standard error. If launching or reading the process fails, the
     *         stack trace is printed and whatever output was collected so far
     *         is cleaned and returned.
     */
    public String removeHTML(String url) {
        StringBuffer plainText = new StringBuffer();
        boolean scriptReportedError = false;
        Process p = null;
        BufferedReader stdInput = null;
        try {
            String[] callAndArgs = { "python", pathToPythonScript, url };
            p = Runtime.getRuntime().exec(callAndArgs);

            ErrorDrainer stdError = new ErrorDrainer(p);
            stdError.start();

            stdInput = new BufferedReader(new InputStreamReader(
                    p.getInputStream()));
            // read the output
            String s;
            while ((s = stdInput.readLine()) != null) {
                plainText.append(s);
                plainText.append("\n");
            }

            // wait until stderr has been read to its end before judging it
            stdError.join();
            scriptReportedError = stdError.sawOutput();
        } catch (InterruptedException e) {
            // Restore the flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(stdInput);
            if (p != null) {
                // Releases the process's streams; kills it if still running.
                p.destroy();
            }
        }
        if (scriptReportedError) {
            return null;
        }
        return removeTags(plainText.toString());
    }

    /** Closes the reader, ignoring a null reader and any failure to close. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (Exception ignored) {
            // Nothing useful can be done if closing fails.
        }
    }

    /**
     * Reads a process's standard error to its end and records whether any
     * line was produced.
     */
    private static final class ErrorDrainer extends Thread {
        private final BufferedReader reader;
        private volatile boolean sawOutput;

        ErrorDrainer(Process process) {
            this.reader = new BufferedReader(new InputStreamReader(
                    process.getErrorStream()));
            setDaemon(true);
        }

        public void run() {
            try {
                while (reader.readLine() != null) {
                    sawOutput = true;
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                closeQuietly(reader);
            }
        }

        boolean sawOutput() {
            return sawOutput;
        }
    }

    /**
     * Prints the plain text extracted for the URL given as the first argument.
     *
     * @param args command-line arguments; {@code args[0]} is the URL
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("usage: Readability <url>");
            System.exit(1);
        }
        Readability r = new Readability();
        // removeHTML already strips tags, so no second removeTags pass is needed.
        String text = r.removeHTML(args[0]);
        if (text == null) {
            System.err.println("Readability script reported an error for " + args[0]);
            System.exit(1);
        }
        System.out.println(text);
    }
}