package com.cognitionis.nlp_files.parentical_parsers;
/**
*
* @author Héctor Llorens
* @since 2011
*/
import java.util.*;
/**
 * Incremental parser for parenthesized syntax annotations that arrive in
 * pieces (one call to {@link #parse(String)} per piece).
 *
 * <p>Each piece is a fragment such as {@code (S(NP*}, {@code *} or {@code *)}:
 * opening brackets followed by a label open a constituent, closing brackets
 * close the innermost one, and blanks and {@code *} are ignored. While parsing,
 * the class tracks the stack of open constituent labels, a BIO-style tag for
 * the current "main phrase" and the token span of every main phrase that has
 * been closed.
 *
 * <p>A piece that consists only of closing brackets is treated as the second
 * half of the previous token (see the "two parses of the same token" handling
 * in {@link #parse(String)}) and does not advance the token counter.
 *
 * <p>Instances are stateful and not designed for concurrent use.
 */
public class SyntColParser {

    /**
     * Constituent labels that can start a main phrase. {@code WHADVP} is the
     * wh-adverb phrase label; {@code WHAVP} is kept because earlier versions of
     * this class accepted that spelling.
     */
    private static final Set<String> MAIN_PHRASE_LABELS = new HashSet<String>(Arrays.asList(
            "PP", "NP", "VP", "ADJP", "ADVP", "CONJP",
            "WHADJP", "WHADVP", "WHAVP", "WHPP", "WHNP", "QP"));

    /** Every significant character parsed so far (blanks and '*' excluded). */
    private StringBuilder full;
    /** Number of currently open brackets. */
    private int parlevel;
    /** Number of tokens consumed; closing-only pieces are not counted. */
    private int tokens_parsed;
    /** "O" when outside any main phrase, otherwise "B-LABEL" / "I-LABEL". */
    private String currentMainPhrase;
    /** 1-based position inside the current main phrase, 0 when outside. */
    private int currentPositionInMainPhrase;
    /** [first token, last token] of the main phrase currently being read. */
    private Integer[] currentMainPhraseSpan;
    /** Value of {@link #parlevel} at which the current main phrase closes. */
    private int currentMainPhrase_parlevel;
    /** Suffix appended to {@link #currentSubsent} when an SBAR is opened. */
    private int sentnum;
    /** Starts as "O"; grows by one digit for every SBAR opened. */
    private String currentSubsent;
    // NOTE(review): written when an SBAR opens but never read in this class.
    private int currentSubsent_parlevel;
    /** Labels of the currently open constituents, outermost first. */
    private Stack<String> st;
    /** Spans of all main phrases closed so far, in closing order. */
    private ArrayList<Integer[]> mainphrases_span;

    /** Creates a parser with no open constituents and no tokens consumed. */
    public SyntColParser() {
        full = new StringBuilder();
        parlevel = 0;
        tokens_parsed = 0;
        sentnum = 0;
        currentMainPhrase = "O";
        currentSubsent = "O";
        currentSubsent_parlevel = 0;
        currentPositionInMainPhrase = 0;
        currentMainPhrase_parlevel = 0;
        st = new Stack<String>();
        currentMainPhraseSpan = new Integer[] {0, 0};
        mainphrases_span = new ArrayList<Integer[]>();
    }

    /**
     * Consumes one piece of the parenthesized annotation and updates the
     * parser state.
     *
     * <p>If a main phrase is open when the call starts, its tag is switched to
     * the "I-" form and its position/span end are advanced before the piece is
     * read. Any failure (empty label, unbalanced closing bracket, null input)
     * is reported on {@code System.err}; when the system property
     * {@code DEBUG} is {@code true} the stack trace is printed and the JVM
     * exits with status 1.
     *
     * @param input the piece to parse; surrounding and embedded blanks and
     *              {@code *} characters are ignored
     */
    public void parse(String input) {
        try {
            String linput = input.trim();
            // Decide on the same text the loop below consumes, so that padded
            // closing pieces such as " ) " are recognised as closing-only.
            boolean closingOnly = isOnlyClosingBrackets(linput);
            StringBuilder element = new StringBuilder();

            if (!currentMainPhrase.equals("O")) {
                currentMainPhrase = "I" + currentMainPhrase.substring(1);
                currentPositionInMainPhrase++;
                currentMainPhraseSpan[1] = tokens_parsed;
            }

            boolean inElem = false;
            for (int cn = 0; cn < linput.length(); cn++) {
                char cinput = linput.charAt(cn);
                if (isBlank(cinput) || cinput == '*') {
                    continue;
                }
                full.append(cinput);
                if (cinput == '(') {
                    if (inElem) {
                        // The label read so far is complete: a child follows.
                        openElement(element.toString(), true);
                        element.setLength(0);
                    } else {
                        inElem = true;
                    }
                    parlevel++;
                } else if (cinput == ')') {
                    if (parlevel == currentMainPhrase_parlevel && !currentMainPhrase.endsWith("O")) {
                        closeMainPhrase(closingOnly);
                    }
                    parlevel--;
                    // Throws EmptyStackException on unbalanced input; reported below.
                    st.pop();
                } else {
                    element.append(cinput);
                }
            }

            if (inElem) {
                // The last label of the piece is terminated by the end of input.
                openElement(element.toString(), false);
            }

            // A closing-only piece is the second parse of an already counted token.
            if (!closingOnly) {
                tokens_parsed++;
            }
        } catch (Exception e) {
            System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
        }
    }

    /** Returns true for the whitespace characters the parser skips. */
    private static boolean isBlank(char c) {
        return c == ' ' || c == '\n' || c == '\r' || c == '\t';
    }

    /**
     * Returns true when the text, blanks aside, contains nothing but ')'.
     * Empty text counts as closing-only; a '*' makes the piece a real token.
     */
    private static boolean isOnlyClosingBrackets(String text) {
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!isBlank(c) && c != ')') {
                return false;
            }
        }
        return true;
    }

    /**
     * Registers a newly opened constituent: pushes its label, records SBAR
     * openings and starts a main phrase when the label qualifies.
     *
     * @param element     the constituent label
     * @param childFollows true when the label was terminated by another '('
     *                     in the same piece rather than by the end of the piece
     * @throws Exception if the label is empty
     */
    private void openElement(String element, boolean childFollows) throws Exception {
        if (element.length() < 1) {
            throw new Exception("Empty element");
        }
        st.push(element);

        // Only SBAR is tracked as a sub-sentence, not plain S.
        if (element.equals("SBAR")) {
            currentSubsent = currentSubsent + sentnum;
            // NOTE(review): historically sentnum is reset only when a child
            // bracket follows in the same piece; kept as-is, confirm intent.
            if (childFollows) {
                sentnum = 1;
            }
            currentSubsent_parlevel = parlevel;
        }

        if (startsMainPhrase(element)) {
            // NOTE(review): a phrase nested in a VP replaces the VP here and
            // the VP span is never recorded; confirm this is intended.
            currentMainPhrase = "B-" + element;
            currentPositionInMainPhrase = 1;
            currentMainPhrase_parlevel = parlevel;
            currentMainPhraseSpan[0] = tokens_parsed;
            currentMainPhraseSpan[1] = tokens_parsed;
        }
    }

    /**
     * A label starts a main phrase when no main phrase is open, or when the
     * open main phrase is a VP and the label is any main-phrase label but VP.
     */
    private boolean startsMainPhrase(String element) {
        if (!MAIN_PHRASE_LABELS.contains(element)) {
            return false;
        }
        if (currentMainPhrase.equals("O")) {
            return true;
        }
        boolean insideVp = currentMainPhrase.equals("B-VP") || currentMainPhrase.equals("I-VP");
        return insideVp && !element.equals("VP");
    }

    /**
     * Closes the current main phrase, stores its span and resets the
     * main-phrase state.
     *
     * @param closingOnly true when the piece being parsed holds only closing
     *                    brackets; the span end was then advanced one token
     *                    too far at the start of the call and is corrected
     */
    private void closeMainPhrase(boolean closingOnly) {
        currentMainPhrase = "O";
        currentPositionInMainPhrase = 0;
        currentMainPhrase_parlevel = 0;
        if (closingOnly) {
            currentMainPhraseSpan[1] = currentMainPhraseSpan[1] - 1;
        }
        mainphrases_span.add(currentMainPhraseSpan);
        // Fresh array so the stored span is never modified afterwards.
        currentMainPhraseSpan = new Integer[2];
    }

    /** @return the number of currently open brackets */
    public int getParlevel() {
        return parlevel;
    }

    /** @return "O" followed by one digit per SBAR opened so far */
    public String getCurrentSubsent() {
        return currentSubsent;
    }

    /** @return "O", or "B-LABEL" / "I-LABEL" for the open main phrase */
    public String getCurrentMainPhraseBIO() {
        return currentMainPhrase;
    }

    /** @return the 1-based position in the open main phrase, 0 when outside */
    public int getCurrentPositionInMainPhrase() {
        return currentPositionInMainPhrase;
    }

    /**
     * @return the labels of the open constituents joined with "-", outermost
     *         first; the empty string when nothing is open
     */
    public String getCurrent() {
        return joinOpenLabels();
    }

    /**
     * Computes the common leading part of a previous path and the current
     * path of open constituents. Despite its name this method does not modify
     * the parser.
     *
     * @param past a "-"-separated label path, as returned by {@link #getCurrent()}
     * @return the longest shared "-"-separated prefix, possibly empty
     */
    public String updateCurrent(String past) {
        String[] past_arr = past.split("-");
        String[] curr_arr = joinOpenLabels().split("-");
        int limit = Math.min(curr_arr.length, past_arr.length);

        StringBuilder commonSynt = new StringBuilder();
        for (int i = 0; i < limit && curr_arr[i].equals(past_arr[i]); i++) {
            if (i != 0) {
                commonSynt.append('-');
            }
            commonSynt.append(curr_arr[i]);
        }
        return commonSynt.toString();
    }

    /** Joins the stack of open labels with "-", outermost first. */
    private String joinOpenLabels() {
        String[] st_arr = new String[st.size()];
        st.copyInto(st_arr);
        StringBuilder current = new StringBuilder();
        for (int i = 0; i < st_arr.length; i++) {
            if (i != 0) {
                current.append('-');
            }
            current.append(st_arr[i]);
        }
        return current.toString();
    }

    /** Returns true for labels that end a phrase path: VP or anything starting a clause ("S..."). */
    private static boolean isPhraseBoundary(String label) {
        return label.equals("VP") || label.matches("S.*");
    }

    /**
     * Extracts the innermost phrase path from a label path.
     *
     * <p>Walking from the innermost label outwards: if the innermost label is
     * VP, "VP" is returned; otherwise labels are collected until a VP or a
     * label starting with "S" is reached (exclusive).
     *
     * @param synt a "-"-separated label path, outermost first
     * @return the collected labels joined with "-", outermost first
     */
    public String getCurrentPhrase(String synt) {
        String[] synt_arr = synt.split("-");
        String phraseSynt = "";
        boolean started = false;
        for (int i = synt_arr.length - 1; i >= 0; i--) {
            String label = synt_arr[i];
            if (!started) {
                phraseSynt = label;
                if (label.equals("VP")) {
                    break;
                }
                started = true;
            } else {
                if (isPhraseBoundary(label)) {
                    break;
                }
                phraseSynt = label + "-" + phraseSynt;
            }
        }
        return phraseSynt;
    }

    /**
     * Like {@link #getCurrentPhrase(String)} but returns only the outermost
     * label of the phrase path instead of the whole path.
     *
     * @param synt a "-"-separated label path, outermost first
     * @return the outermost label below the nearest VP / "S..." ancestor, or
     *         "VP" when the innermost label is VP
     */
    public String getCurrentMainPhrase(String synt) {
        String[] synt_arr = synt.split("-");
        String phraseSynt = "";
        boolean started = false;
        for (int i = synt_arr.length - 1; i >= 0; i--) {
            String label = synt_arr[i];
            if (!started) {
                phraseSynt = label;
                if (label.equals("VP")) {
                    break;
                }
                started = true;
            } else {
                if (isPhraseBoundary(label)) {
                    break;
                }
                phraseSynt = label;
            }
        }
        return phraseSynt;
    }

    /** @return every significant character parsed so far, concatenated */
    public String getFull() {
        return full.toString();
    }

    /**
     * @return the live list of [first token, last token] spans of the main
     *         phrases closed so far
     */
    public ArrayList<Integer[]> getMainPhrasesSpan() {
        return mainphrases_span;
    }

    /**
     * Re-reads everything parsed so far and lists the phrase path found at
     * each constituent (see {@link #getAllCurrentPhrases(Stack, ArrayList)}).
     *
     * <p>Errors are reported exactly as in {@link #parse(String)}; the phrases
     * collected before the error are still returned.
     *
     * @return the phrase paths in order of appearance
     */
    public ArrayList<String> getAllPhrases() {
        ArrayList<String> phrases = new ArrayList<String>();
        try {
            Stack<String> localst = new Stack<String>();
            String linput = full.toString().trim();
            StringBuilder element = new StringBuilder();
            boolean inElem = false;
            for (int cn = 0; cn < linput.length(); cn++) {
                char cinput = linput.charAt(cn);
                if (isBlank(cinput) || cinput == '*') {
                    continue;
                }
                if (cinput != '(' && cinput != ')') {
                    element.append(cinput);
                    continue;
                }
                // A bracket terminates the label being read, if any.
                if (inElem) {
                    if (element.length() < 1) {
                        throw new Exception("Empty element");
                    }
                    localst.push(element.toString());
                    getAllCurrentPhrases(localst, phrases);
                    element.setLength(0);
                }
                if (cinput == '(') {
                    inElem = true;
                } else {
                    inElem = false;
                    localst.pop();
                }
            }
        } catch (Exception e) {
            System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
            if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
                e.printStackTrace(System.err);
                System.exit(1);
            }
        }
        return phrases;
    }

    /**
     * Appends to {@code phrases} the phrase path ending at the top of
     * {@code localst}, if there is one.
     *
     * <p>Walking from the top of the stack downwards: a top label of VP yields
     * "VP"; a top label not ending in "P" yields nothing; otherwise up to
     * three labels are collected, stopping early at a VP or a label starting
     * with "S".
     *
     * @param localst stack of open constituent labels, outermost at the bottom
     * @param phrases list that receives the phrase path
     */
    public void getAllCurrentPhrases(Stack<String> localst, ArrayList<String> phrases) {
        String[] synt_arr = new String[localst.size()];
        localst.copyInto(synt_arr);
        String phraseSynt = "";
        int phraseDept = 0;
        for (int i = synt_arr.length - 1; i >= 0; i--) {
            String label = synt_arr[i];
            if (phraseDept == 0) {
                if (label.equals("VP")) {
                    phraseSynt = label;
                    break;
                }
                if (!label.endsWith("P")) {
                    break;
                }
                phraseSynt = label;
                phraseDept++;
            } else {
                if (phraseDept >= 3 || isPhraseBoundary(label)) {
                    break;
                }
                phraseSynt = label + "-" + phraseSynt;
                phraseDept++;
            }
        }
        if (!phraseSynt.equals("")) {
            phrases.add(phraseSynt);
        }
    }
}