package com.personalityextractor.entity.extractor;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import com.personalityextractor.commons.data.NounPhrase;
import com.personalityextractor.commons.data.Tweet;
/**
 * Entity extractor that runs an external SENNA binary on a piece of text and
 * groups consecutive noun-tagged tokens of its output into noun phrases.
 *
 * <p>The tagger output is consumed line by line: each line is split on runs of
 * spaces/tabs, the first column is taken as the word and the second column as
 * its part-of-speech tag. Lines with fewer than three columns are ignored.
 */
public class SennaNounPhraseExtractor implements IEntityExtractor {

    /** System property that may be set to override the default install directory. */
    private static final String INSTALL_DIR_PROPERTY = "senna.install.dir";

    /** Directory containing the tagger binary; also used as its working directory. */
    private static final File SENNA_INSTALL_DIR =
            new File(System.getProperty(INSTALL_DIR_PROPERTY, "/home/ubuntu/senna/senna/"));

    /** File name of the tagger executable inside {@link #SENNA_INSTALL_DIR}. */
    private static final String SENNA_BINARY = "senna-linux64";

    /** Charset used for both the text sent to the tagger and the output read back. */
    private static final Charset SENNA_CHARSET = Charset.forName("UTF-8");

    /** Column separator of the tagger output (one or more spaces or tabs). */
    private static final Pattern COLUMN_SEPARATOR = Pattern.compile("[ \t]+");

    /**
     * Runs the tagger on a single piece of text and returns everything it
     * wrote to standard output.
     *
     * <p>The text is passed on the process's standard input rather than being
     * spliced into a shell command line, so shell metacharacters in the text
     * are never interpreted.
     *
     * @param line text to tag; {@code null} yields {@code null}
     * @return the raw tagger output, or {@code null} if the process could not
     *         be started, communicated with, or the wait was interrupted
     */
    public static String getSennaOutput(String line) {
        if (line == null) {
            return null;
        }
        Process senna = null;
        try {
            File binary = new File(SENNA_INSTALL_DIR, SENNA_BINARY);
            ProcessBuilder pb = new ProcessBuilder(binary.getAbsolutePath());
            pb.directory(SENNA_INSTALL_DIR);
            senna = pb.start();

            // Send the text and close stdin so the tagger sees end-of-input.
            Writer toSenna = new OutputStreamWriter(senna.getOutputStream(), SENNA_CHARSET);
            try {
                toSenna.write(line);
                toSenna.write('\n');
            } finally {
                toSenna.close();
            }

            // Drain stdout BEFORE waiting: waiting first can block forever once
            // the child has filled the pipe buffer.
            StringBuilder output = new StringBuilder();
            Reader fromSenna = new InputStreamReader(senna.getInputStream(), SENNA_CHARSET);
            try {
                char[] buffer = new char[4096];
                int read;
                while ((read = fromSenna.read(buffer)) != -1) {
                    output.append(buffer, 0, read);
                }
            } finally {
                fromSenna.close();
            }

            senna.waitFor();
            return output.toString();
        } catch (InterruptedException e) {
            // Preserve the interrupt for callers further up the stack.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (senna != null) {
                // Releases the remaining process streams; also stops the child
                // if we are leaving because of an error.
                senna.destroy();
            }
        }
        return null;
    }

    /**
     * Extracts phrases made of consecutive tokens whose tag starts with
     * {@code NNP}.
     *
     * @param sennaOutput raw tagger output; {@code null} is treated as empty
     * @return lower-cased phrases in order of appearance, never {@code null}
     */
    public ArrayList<String> getProperNounPhrases(String sennaOutput) {
        return collectPhrases(parseTaggedTokens(sennaOutput), true);
    }

    /**
     * Extracts phrases made of consecutive tokens whose tag starts with
     * {@code NN} but not with {@code NNP}.
     *
     * @param sennaOutput raw tagger output; {@code null} is treated as empty
     * @return lower-cased phrases in order of appearance, never {@code null}
     */
    public ArrayList<String> getCommonNounPhrases(String sennaOutput) {
        return collectPhrases(parseTaggedTokens(sennaOutput), false);
    }

    /**
     * Extracts both kinds of phrases and wraps each in a {@link NounPhrase}.
     * All {@code NNP} phrases (type {@code "PN"}) come first, followed by all
     * other {@code NN*} phrases (type {@code "CN"}).
     *
     * @param sennaOutput raw tagger output; {@code null} is treated as empty
     * @return the typed phrases; if an error occurs part-way through, the
     *         phrases collected so far are returned
     */
    public ArrayList<NounPhrase> getNounPhrasesWithType(String sennaOutput) {
        ArrayList<NounPhrase> nounPhrases = new ArrayList<NounPhrase>();
        try {
            List<String[]> tagged = parseTaggedTokens(sennaOutput);
            for (String phrase : collectPhrases(tagged, true)) {
                nounPhrases.add(new NounPhrase(phrase, "PN"));
            }
            for (String phrase : collectPhrases(tagged, false)) {
                nounPhrases.add(new NounPhrase(phrase, "CN"));
            }
        } catch (Exception e) {
            // Best effort: keep what was collected, but report the failure the
            // same way the other methods of this class do.
            e.printStackTrace();
        }
        return nounPhrases;
    }

    /**
     * Tags the given text and returns its {@code NNP} phrases.
     *
     * @param line text to analyse
     * @return the extracted phrases; empty if the tagger produced no output
     */
    @Override
    public List<String> extract(String line) {
        return new ArrayList<String>(getProperNounPhrases(getSennaOutput(line)));
    }

    /**
     * Reads all lines of a text file.
     *
     * @param file path of the file to read
     * @return the lines read; if reading fails, the error is printed and the
     *         lines read up to that point (possibly none) are returned
     */
    public List<String> readLinesinFile(String file) {
        List<String> lines = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            while ((line = br.readLine()) != null) {
                lines.add(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return lines;
    }

    /** Small manual demo: prints the common-noun phrases of a sample sentence. */
    public static void main(String[] args) {
        SennaNounPhraseExtractor sn = new SennaNounPhraseExtractor();
        String line1 = "I am near the Berghoff at Restaurant";
        List<String> lines = new ArrayList<String>();
        lines.add(line1);
        for (String line : lines) {
            System.out.println("Tweet: " + line);
            Tweet t = new Tweet(line);
            for (String sent : t.getSentences()) {
                System.out.println("Sentence: " + sent);
                String sennaout = getSennaOutput(sent);
                System.out.println(sn.getCommonNounPhrases(sennaout));
            }
        }
    }

    /**
     * Splits tagger output into {@code [word, posTag]} pairs, skipping every
     * line that has fewer than three columns.
     */
    private static List<String[]> parseTaggedTokens(String sennaOutput) {
        List<String[]> tagged = new ArrayList<String[]>();
        if (sennaOutput == null) {
            return tagged;
        }
        for (String line : sennaOutput.split("\n")) {
            String[] columns = COLUMN_SEPARATOR.split(line.trim());
            if (columns.length < 3) {
                continue;
            }
            tagged.add(new String[] { columns[0].trim(), columns[1].trim() });
        }
        return tagged;
    }

    /**
     * Joins each maximal run of consecutive matching tokens into one
     * space-separated, lower-cased phrase.
     *
     * @param tagged {@code [word, posTag]} pairs in sentence order
     * @param proper {@code true} to match {@code NNP*} tags, {@code false} to
     *               match {@code NN*} tags that are not {@code NNP*}
     */
    private static ArrayList<String> collectPhrases(List<String[]> tagged, boolean proper) {
        ArrayList<String> phrases = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        for (String[] token : tagged) {
            String tag = token[1];
            boolean isProper = tag.startsWith("NNP");
            boolean matches = proper ? isProper : (tag.startsWith("NN") && !isProper);
            if (matches) {
                current.append(token[0]).append(' ');
            } else if (current.length() > 0) {
                phrases.add(current.toString().trim().toLowerCase(Locale.ROOT));
                current.setLength(0);
            }
        }
        // Flush a phrase that runs up to the last token.
        if (current.length() > 0) {
            phrases.add(current.toString().trim().toLowerCase(Locale.ROOT));
        }
        return phrases;
    }
}