package rainbownlp.analyzer.sentenceclause;

import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import rainbownlp.core.Artifact;
import rainbownlp.parser.DependencyLine;
import rainbownlp.parser.StanfordParser;
import rainbownlp.util.StanfordDependencyUtil;

/**
 * Groups the typed dependencies of one sentence into {@link Clause} objects.
 *
 * <p>The dependency lines are processed in several passes (see
 * {@link #analyzeSentence()}): subjects, objects and copulas first, then clausal
 * complements, then verb particles/auxiliaries, negation, modifiers, prepositional
 * objects and marks, and finally noun-phrase clausal modifiers. Every word offset
 * that could be attached to a clause is recorded in {@link #clauseMap}; dependency
 * lines that could not be attached are collected and exposed through
 * {@link #getPhrases()}.
 *
 * <p>Dependency offsets are treated as 1-based throughout this class: a word is
 * looked up with {@code getChildByWordIndex(offset - 1)} and
 * {@link #getPOSTag(Integer)} indexes the tagged tokens with {@code offset - 1}.
 */
public class SentenceClauseManager {

    /** Matches a collapsed prepositional relation name and captures the preposition. */
    private static final Pattern PREP_PATTERN = Pattern.compile("prep_(\\w+)");

    /** Matches a collapsed conjunction relation name and captures the conjunction. */
    private static final Pattern CONJ_PATTERN = Pattern.compile("conj_(\\w+)");

    /** Value returned by {@link #getPOSTag(Integer)} when no tag can be resolved. */
    private static final String MISSING_POS = "missing";

    private Artifact relatedSentence;
    private String sentContent;
    private String posTags;
    private String stanDependenciesStr;

    /** All dependency lines parsed from the dependency string of the sentence. */
    public ArrayList<DependencyLine> sentDepLines = new ArrayList<DependencyLine>();

    /** Distinct clauses found by the analysis; stays null until the analysis has run. */
    ArrayList<Clause> clauses;

    /** Word offset -> word text, for every word seen in a dependency line with a relation name. */
    public HashMap<Integer, String> offsetMap = new HashMap<Integer, String>();

    /** Keeps the offsets as the key; the value is the lemma. */
    public HashMap<Integer, String> lemmaMap = new HashMap<Integer, String>();

    /** Same as {@link #lemmaMap}, but holds the original tokens. */
    public HashMap<Integer, String> tokenMap = new HashMap<Integer, String>();

    /**
     * Mapping of lemmas to original tokens.
     * TODO: when a lemma occurs with different original tokens the earlier entry is overwritten.
     */
    public HashMap<String, String> lemmaTokenMap = new HashMap<String, String>();

    /** Word offset -> the clause that the word at that offset was attached to. */
    public HashMap<Integer, Clause> clauseMap = new HashMap<Integer, Clause>();

    /** Dependency lines whose governor or dependent clause could not be resolved. */
    ArrayList<DependencyLine> phrases = new ArrayList<DependencyLine>();

    public String filename;
    public String[] normalized_dependencies;

    /**
     * Creates an empty manager. No analysis is run, so {@link #getClauses()} returns
     * null until the state is populated by other means.
     */
    public SentenceClauseManager() {
    }

    /**
     * Creates a manager for the given sentence and immediately analyzes it.
     *
     * @param relatedSentence the sentence artifact to analyze
     * @throws Exception if parsing or analyzing the sentence fails
     */
    public SentenceClauseManager(Artifact relatedSentence) throws Exception {
        setRelatedSentence(relatedSentence);
        setSentContent(relatedSentence.getContent());
        setPosTags(relatedSentence.getPOS());
        loadClauses();
    }

    /**
     * Returns the unresolved dependency lines as two-word strings, with the two words
     * ordered by their offsets in the sentence.
     *
     * @return a new list with one string per unresolved dependency line
     */
    public ArrayList<String> getPhrases() {
        ArrayList<String> phrase_strings = new ArrayList<String>();
        for (DependencyLine depLine : phrases) {
            if (depLine.firstOffset < depLine.secondOffset) {
                phrase_strings.add(depLine.firstPart + " " + depLine.secondPart);
            } else {
                phrase_strings.add(depLine.secondPart + " " + depLine.firstPart);
            }
        }
        return phrase_strings;
    }

    /**
     * Loads POS tags and dependencies (parsing the sentence content when the sentence
     * artifact carries no POS tags), fills the token and lemma maps and runs the
     * clause analysis.
     *
     * @throws Exception if parsing or analyzing the sentence fails
     */
    void loadClauses() throws Exception {
        if (relatedSentence.getPOS() == null) {
            StanfordParser s_parser = new StanfordParser();
            s_parser.parse(sentContent);
            setPosTags(s_parser.getTagged());
            setStanDependenciesStr(s_parser.getDependencies());
        } else {
            setPosTags(relatedSentence.getPOS());
            setStanDependenciesStr(relatedSentence.getStanDependency());
        }

        tokenMap = StanfordDependencyUtil.getTokens(posTags);
        lemmaMap = StanfordDependencyUtil.getLemmaMap(posTags);
        lemmaTokenMap = StanfordDependencyUtil.getLemmaTokenmaps(posTags);

        analyzeSentence();
    }

    /**
     * Runs all analysis passes over the dependency lines and fills {@link #clauses}
     * with the distinct clauses registered in {@link #clauseMap}.
     *
     * TODO: generally improve this method, it is not perfect.
     */
    private void analyzeSentence() throws Exception {
        sentDepLines = StanfordDependencyUtil.parseDepLinesFromString(getStanDependenciesStr());
        clauses = new ArrayList<Clause>();

        // Pass 1: subjects, objects and copulas create the initial clauses.
        for (DependencyLine curLine : sentDepLines) {
            if (curLine.relationName == null) {
                continue;
            }
            offsetMap.put(curLine.firstOffset, curLine.firstPart);
            offsetMap.put(curLine.secondOffset, curLine.secondPart);

            if (relationIs(curLine, "nsubj", "xsubj")) {
                handleSubject(curLine);
            }
            if (relationIs(curLine, "dobj", "iobj", "nsubjpass")) {
                handleObject(curLine);
            }
            if (relationIs(curLine, "cop")) {
                handleCopula(curLine);
            }
        }

        // Pass 2: clausal complements (xcomp, ccomp). The handlers ignore lines whose
        // relation name is null, so no explicit null filter is needed from here on.
        for (DependencyLine curLine : sentDepLines) {
            handleComp(curLine);
        }

        // Pass 3: everything that attaches to an already known clause.
        for (DependencyLine curLine : sentDepLines) {
            handleVerbDependencies(curLine);
            handleNegation(curLine);
            handleModifiers(curLine);
            handleIobj(curLine);
            handleMarks(curLine);
        }

        // Pass 4: clausal modifiers of noun phrases.
        for (DependencyLine curLine : sentDepLines) {
            handleNPClMod(curLine);
        }

        // Several offsets usually point to the same clause; keep each clause once.
        for (Clause c : clauseMap.values()) {
            if (c != null && !clauses.contains(c)) {
                clauses.add(c);
            }
        }
    }

    /**
     * Handles an nsubj/xsubj line: adds the subject to the clause of its governor, or
     * creates a new clause when the governor is a verb/modal, or an adjective/noun
     * that also governs a copula. A new clause whose governor is neither is discarded.
     */
    private void handleSubject(DependencyLine curLine) throws SQLException {
        Clause governor_cl = clauseMap.get(curLine.firstOffset);

        // The governor has already been observed: just attach the subject.
        if (governor_cl != null) {
            governor_cl.clauseSubject.add(curLine);
            clauseMap.put(curLine.secondOffset, governor_cl);
            return;
        }

        String pos = posOfWord(curLine.firstOffset);
        governor_cl = new Clause();
        governor_cl.clauseSubject.add(curLine);

        if (isVerbOrModal(pos)) {
            governor_cl.clauseVerb.verbMainPart = curLine.firstPart;
            governor_cl.clauseVerb.offset = curLine.firstOffset;
            clauseMap.put(curLine.firstOffset, governor_cl);
            clauseMap.put(curLine.secondOffset, governor_cl);
        } else if (pos != null && (pos.startsWith("JJ") || pos.startsWith("NN"))) {
            // TODO: process more
            // The governor is a complement only if a cop relation has it as first part.
            if (hasCopulaGovernedBy(curLine)) {
                governor_cl.complement = curLine.firstPart;
                governor_cl.complementOffset = curLine.firstOffset;
                clauseMap.put(curLine.firstOffset, governor_cl);
                clauseMap.put(curLine.secondOffset, governor_cl);
            }
        }
    }

    /** Returns true when some cop line has the same first offset as the given line. */
    private boolean hasCopulaGovernedBy(DependencyLine subjectLine) {
        for (DependencyLine d : sentDepLines) {
            if (relationIs(d, "cop") && d.firstOffset == subjectLine.firstOffset) {
                return true;
            }
        }
        return false;
    }

    /** Handles a dobj/iobj/nsubjpass line: adds the dependent as an object of the governor clause. */
    private void handleObject(DependencyLine curLine) throws SQLException {
        Clause governor_cl = getGovernorVerbOrComplement(curLine);

        SentenceObject new_object = new SentenceObject();
        new_object.content = curLine.secondPart;
        new_object.contentOffset = curLine.secondOffset;
        governor_cl.clauseObject.add(new_object);

        clauseMap.put(curLine.secondOffset, governor_cl);
    }

    /**
     * Handles a cop line, where the first part is the complement and the second part
     * is the copular verb.
     */
    private void handleCopula(DependencyLine curLine) {
        Clause governor = clauseMap.get(curLine.firstOffset);
        Clause dependent = clauseMap.get(curLine.secondOffset);

        if (governor != null) {
            // The complement already has a clause: record the verb on it.
            governor.clauseVerb.verbMainPart = curLine.secondPart;
            governor.clauseVerb.offset = curLine.secondOffset;
            clauseMap.put(curLine.secondOffset, governor);
        } else if (dependent != null) {
            // Only the verb has been observed: record the complement on its clause.
            dependent.complement = curLine.firstPart;
            dependent.complementOffset = curLine.firstOffset;
            clauseMap.put(curLine.firstOffset, dependent);
        } else {
            // Neither is known: a new clause receives both the verb and the complement.
            Clause newClause = new Clause();
            newClause.complement = curLine.firstPart;
            newClause.complementOffset = curLine.firstOffset;
            newClause.clauseVerb.verbMainPart = curLine.secondPart;
            newClause.clauseVerb.offset = curLine.secondOffset;
            clauseMap.put(curLine.firstOffset, newClause);
            clauseMap.put(curLine.secondOffset, newClause);
        }
    }

    /**
     * Handles ccomp/xcomp lines, e.g. "He says that you like to swim" gives
     * ccomp(says, like). Other relations are ignored.
     *
     * @param curLine the dependency line to inspect
     * @throws SQLException if looking up a word of the sentence fails
     */
    void handleComp(DependencyLine curLine) throws SQLException {
        if (!relationIs(curLine, "ccomp", "xcomp")) {
            return;
        }

        // Looked up only for complement relations; may be null when the tag is unknown.
        String d_tag = posOfWord(curLine.secondOffset);
        boolean dependentIsVerb = d_tag != null && d_tag.startsWith("VB");

        boolean governorKnown = clauseMap.containsKey(curLine.firstOffset);
        boolean dependentKnown = clauseMap.containsKey(curLine.secondOffset);
        Clause governor_clause = clauseMap.get(curLine.firstOffset);
        Clause dependent_clause = clauseMap.get(curLine.secondOffset);

        if (governorKnown && dependentKnown) {
            governor_clause.clauseComplements.add(dependent_clause);
            dependent_clause.governer = governor_clause;
        } else if (governorKnown) {
            if (dependentIsVerb) {
                dependent_clause = new Clause();
                dependent_clause.clauseVerb.verbMainPart = curLine.secondPart;
                dependent_clause.clauseVerb.offset = curLine.secondOffset;
                clauseMap.put(curLine.secondOffset, dependent_clause);
                governor_clause.clauseComplements.add(dependent_clause);
                dependent_clause.governer = governor_clause;
            }
        } else if (dependentKnown) {
            governor_clause = getGovernorVerbOrComplement(curLine);
            governor_clause.clauseComplements.add(dependent_clause);
        } else {
            // Neither side is known: register a clause for each side.
            // NOTE(review): the two clauses are not linked to each other here, unlike
            // in the branches above - confirm whether that is intended.
            getGovernorVerbOrComplement(curLine);
            if (dependentIsVerb) {
                dependent_clause = new Clause();
                dependent_clause.clauseVerb.verbMainPart = curLine.secondPart;
                dependent_clause.clauseVerb.offset = curLine.secondOffset;
                clauseMap.put(curLine.secondOffset, dependent_clause);
            }
        }
    }

    /**
     * Handles prt, aux and auxpass lines by recording the particle or auxiliary on the
     * verb of the governor clause. Other relations are ignored.
     *
     * @param depLine the dependency line to inspect
     * @throws SQLException if looking up a word of the sentence fails
     */
    void handleVerbDependencies(DependencyLine depLine) throws SQLException {
        if (!relationIs(depLine, "prt", "aux", "auxpass")) {
            return;
        }

        Clause governor_clause = getGovernorVerbOrComplement(depLine);
        if (relationIs(depLine, "prt")) {
            governor_clause.clauseVerb.prt = depLine.secondPart;
        } else {
            governor_clause.clauseVerb.auxs.add(depLine.secondPart);
            if (relationIs(depLine, "auxpass")) {
                governor_clause.clauseVerb.isPassive = true;
            }
        }
        clauseMap.put(depLine.secondOffset, governor_clause);
    }

    /**
     * Marks clauses as negated: for a neg relation the governor clause (and its verb,
     * when the verb is the governor), and for a det relation whose dependent is "no"
     * the clause of the governor, if one is known. Other relations are ignored.
     *
     * @param depLine the dependency line to inspect
     * @throws SQLException if looking up a word of the sentence fails
     */
    void handleNegation(DependencyLine depLine) throws SQLException {
        if (relationIs(depLine, "neg")) {
            Clause governor = getGovernorVerbOrComplement(depLine);
            if (governor.clauseVerb.offset == depLine.firstOffset) {
                governor.clauseVerb.isNegated = true;
            }
            governor.isNegated = true;
            clauseMap.put(depLine.secondOffset, governor);
        }

        if (relationIs(depLine, "det") && "no".equalsIgnoreCase(depLine.secondPart)) {
            Clause governor = clauseMap.get(depLine.firstOffset);
            if (governor != null) {
                addModifier(governor, depLine);
                governor.isNegated = true;
                clauseMap.put(depLine.secondOffset, governor);
            } else {
                phrases.add(depLine);
            }
        }
    }

    /**
     * Returns the clause governed by the first part of the given line, creating and
     * registering a new one when none exists, or when the first part is a verb/modal
     * but the existing clause has its verb at a different offset.
     *
     * @param depLine the dependency line whose governor clause is needed
     * @return the existing or newly created clause; never null
     * @throws SQLException if looking up a word of the sentence fails
     */
    Clause getGovernorVerbOrComplement(DependencyLine depLine) throws SQLException {
        Clause governor_clause = clauseMap.get(depLine.firstOffset);
        boolean governorIsVerb = isVerbOrModal(posOfWord(depLine.firstOffset));

        // An existing clause is kept unless the governor is a verb that differs from
        // the verb already recorded on that clause.
        boolean create_new_required = governor_clause == null
                || (governorIsVerb && governor_clause.clauseVerb.offset != depLine.firstOffset);
        if (!create_new_required) {
            return governor_clause;
        }

        governor_clause = new Clause();
        if (governorIsVerb) {
            governor_clause.clauseVerb.verbMainPart = depLine.firstPart;
            governor_clause.clauseVerb.offset = depLine.firstOffset;
        } else {
            // TODO: this case should be checked more
            governor_clause.complement = depLine.firstPart;
            governor_clause.complementOffset = depLine.firstOffset;
        }
        clauseMap.put(depLine.firstOffset, governor_clause);
        return governor_clause;
    }

    /**
     * Handles modifier relations (amod, advmod, dep, nn, det, tmod, poss, prep_*,
     * prepc_*): records the dependent as a modifier of the governor word on the clause
     * of the governor. Lines whose governor clause cannot be found are added to the
     * unresolved phrases. Other relations are ignored.
     *
     * @param depLine the dependency line to inspect
     * @throws SQLException declared for callers; kept from the original signature
     */
    void handleModifiers(DependencyLine depLine) throws SQLException {
        boolean isModifier = relationIs(depLine, "amod", "advmod", "dep", "nn", "det", "tmod", "poss")
                || relationStartsWith(depLine, "prepc_")
                || relationStartsWith(depLine, "prep_");
        if (!isModifier) {
            return;
        }

        Clause governor_cl = clauseMap.get(depLine.firstOffset);
        Clause dependent_cl = clauseMap.get(depLine.secondOffset);

        if (governor_cl == null) {
            // TODO: find a solid solution.
            // Try the clause of the word that governs the current governor.
            List<DependencyLine> related_dep_lines =
                    StanfordDependencyUtil.getAllGovernors(sentDepLines, depLine.firstPart);
            for (DependencyLine rel_dep : related_dep_lines) {
                if (rel_dep.secondOffset == depLine.firstOffset) {
                    governor_cl = clauseMap.get(rel_dep.firstOffset);
                    break;
                }
            }
            // Fall back only when the search above found nothing, so that its result
            // is not thrown away.
            if (governor_cl == null) {
                governor_cl = findMissingClause(depLine);
            }
            if (governor_cl == null) {
                phrases.add(depLine);
                return;
            }
        }

        ArrayList<String> modifiers = addModifier(governor_cl, depLine);
        if (relationIs(depLine, "amod", "advmod", "nn")) {
            governor_cl.adjModifierDepMap.put(depLine.firstOffset, modifiers);
        }
        if (dependent_cl == null) {
            clauseMap.put(depLine.secondOffset, governor_cl);
        }
    }

    /**
     * Handles infmod/rcmod lines by attaching the clause of the dependent as a
     * complement of the clause of the governor. Other relations are ignored.
     *
     * @param depLine the dependency line to inspect
     * @throws SQLException if looking up a word of the sentence fails
     */
    void handleNPClMod(DependencyLine depLine) throws SQLException {
        if (!relationIs(depLine, "infmod", "rcmod")) {
            return;
        }

        Clause governor_cl = clauseMap.get(depLine.firstOffset);
        Clause dependent_cl = clauseMap.get(depLine.secondOffset);
        if (governor_cl == null) {
            governor_cl = findMissingClause(depLine);
        }

        if (governor_cl == null) {
            phrases.add(depLine);
        } else if (dependent_cl != null && governor_cl != dependent_cl) {
            governor_cl.clauseComplements.add(dependent_cl);
            dependent_cl.governer = governor_cl;
        } else if (dependent_cl == null) {
            // Try to build the dependent clause from the dependent word itself.
            dependent_cl = buildDependentClause(depLine);
            if (dependent_cl != null) {
                clauseMap.put(depLine.secondOffset, dependent_cl);
                governor_cl.clauseComplements.add(dependent_cl);
                dependent_cl.governer = governor_cl;
            } else {
                // This should not happen.
                phrases.add(depLine);
            }
        }

        // Governor and dependent should be in different clauses. When they are not
        // (this also covers the case where neither was found), remap the governor
        // offset to the clause of another word that governs it, if there is one.
        if (dependent_cl == governor_cl) {
            List<DependencyLine> governing_dep_lines = StanfordDependencyUtil.getAllGovernors(
                    sentDepLines, depLine.firstPart, depLine.firstOffset);
            for (DependencyLine dep : governing_dep_lines) {
                boolean clauseBuilding =
                        relationIs(dep, "nsubj", "xsubj", "dobj", "iobj", "nsubjpass", "cop");
                // Skip the line that leads back to the current dependent.
                if (clauseBuilding && dep.firstOffset != depLine.secondOffset) {
                    Clause new_cl = clauseMap.get(dep.firstOffset);
                    if (new_cl != null) {
                        clauseMap.put(depLine.firstOffset, new_cl);
                    }
                }
            }
        }
    }

    /**
     * Looks for a clause via the first dependency line that governs the first part of
     * the given line.
     *
     * @return the clause registered for that governing word, or null when there is no
     *         governing line or no clause is registered for it
     */
    private Clause findMissingClause(DependencyLine depLine) {
        List<DependencyLine> related_dep_lines = StanfordDependencyUtil.getAllGovernors(
                sentDepLines, depLine.firstPart, depLine.firstOffset);
        if (related_dep_lines.isEmpty()) {
            return null;
        }
        return clauseMap.get(related_dep_lines.get(0).firstOffset);
    }

    /**
     * Builds a clause from the dependent word of the given line: a verb clause for a
     * verb/modal, a complement clause for an adjective. Needs to be completed.
     *
     * @param depLine the dependency line whose second part should start a clause
     * @return the new clause, or null when the dependent has any other (or no) POS tag
     * @throws SQLException if looking up a word of the sentence fails
     */
    Clause buildDependentClause(DependencyLine depLine) throws SQLException {
        String d_pos = posOfWord(depLine.secondOffset);

        Clause dependent_clause = null;
        if (isVerbOrModal(d_pos)) {
            dependent_clause = new Clause();
            dependent_clause.clauseVerb.verbMainPart = depLine.secondPart;
            dependent_clause.clauseVerb.offset = depLine.secondOffset;
        } else if (d_pos != null && d_pos.startsWith("JJ")) {
            dependent_clause = new Clause();
            dependent_clause.complement = depLine.secondPart;
            dependent_clause.complementOffset = depLine.secondOffset;
        }
        return dependent_clause;
    }

    /**
     * Handles prep_* lines whose governor is not a noun (noun governors are handled as
     * modifiers): records the dependent as a prepositional indirect object of the
     * governor clause. Lines without a governor clause go to the unresolved phrases.
     *
     * @param depLine the dependency line to inspect
     * @throws SQLException if looking up a word of the sentence fails
     */
    void handleIobj(DependencyLine depLine) throws SQLException {
        if (!relationStartsWith(depLine, "prep_")) {
            return;
        }

        // An unknown tag is treated like a non-noun governor.
        String gov_pos = posOfWord(depLine.firstOffset);
        if (gov_pos != null && gov_pos.startsWith("NN")) {
            return;
        }

        Clause gov_cl = clauseMap.get(depLine.firstOffset);
        if (gov_cl == null) {
            phrases.add(depLine);
            return;
        }

        // The object refers to the clause of the dependent when there is one, and to
        // the dependent word itself otherwise.
        Clause dep_cl = clauseMap.get(depLine.secondOffset);
        SentenceObject indirect_object = new SentenceObject();
        if (dep_cl != null) {
            indirect_object.clause = dep_cl;
        } else {
            indirect_object.content = depLine.secondPart;
            indirect_object.contentOffset = depLine.secondOffset;
        }
        gov_cl.clauseIObjPrep.put(indirect_object, getPrep(depLine.relationName));
        gov_cl.clauseIObjs.add(depLine.secondPart);
    }

    /**
     * Handles mark lines by flagging the governor clause as marked and storing the
     * marker word. Lines without a governor clause go to the unresolved phrases.
     *
     * @param depLine the dependency line to inspect
     */
    void handleMarks(DependencyLine depLine) {
        if (!relationIs(depLine, "mark")) {
            return;
        }

        Clause gov_cl = clauseMap.get(depLine.firstOffset);
        if (gov_cl != null) {
            gov_cl.isMarked = true;
            gov_cl.clauseMark = depLine.secondPart;
            clauseMap.put(depLine.secondOffset, gov_cl);
        } else {
            phrases.add(depLine);
        }
    }

    /**
     * Extracts the preposition from a relation name of the form {@code prep_<word>}.
     *
     * @param rel_name the relation name; may be null
     * @return the part after {@code prep_}, or null when the name has another form
     */
    public String getPrep(String rel_name) {
        return firstGroupOrNull(PREP_PATTERN, rel_name);
    }

    /**
     * Extracts the conjunction from a relation name of the form {@code conj_<word>}.
     *
     * @param rel_name the relation name; may be null
     * @return the part after {@code conj_}, or null when the name has another form
     */
    String getConj(String rel_name) {
        return firstGroupOrNull(CONJ_PATTERN, rel_name);
    }

    /**
     * Handles conj_* lines by flagging the clause of the dependent; lines without a
     * dependent clause go to the unresolved phrases. This handler is not invoked by
     * the analysis passes in this class.
     *
     * NOTE(review): the flag is named conjuctedBut but is set for every conj_*
     * relation, not only conj_but - confirm the intended behaviour.
     *
     * @param depLine the dependency line to inspect
     */
    void handleConjuction(DependencyLine depLine) {
        if (!relationStartsWith(depLine, "conj_")) {
            return;
        }

        Clause dep_cl = clauseMap.get(depLine.secondOffset);
        if (dep_cl != null) {
            dep_cl.conjuctedBut = true;
        } else {
            phrases.add(depLine);
        }
    }

    /**
     * Returns the lemmas of up to {@code token_count} tokens that follow the given
     * offset, in sentence order.
     *
     * @param offset the offset of the reference token
     * @param token_count the maximum number of following positions to inspect
     * @return the lemmas found; empty when the offset is not a key of the lemma map
     */
    public ArrayList<String> getNextLemmaTokens(Integer offset, Integer token_count) {
        ArrayList<String> next_tokens = new ArrayList<String>();
        if (lemmaMap.containsKey(offset)) {
            addNextLemmas(next_tokens, offset, token_count);
        }
        return next_tokens;
    }

    /**
     * Returns the lemmas of up to {@code token_count} tokens that precede the given
     * offset, nearest token first.
     *
     * @param offset the offset of the reference token
     * @param token_count the maximum number of preceding positions to inspect
     * @return the lemmas found; empty when the offset is not a key of the lemma map
     */
    public ArrayList<String> getPreviousLemmaTokens(Integer offset, Integer token_count) {
        ArrayList<String> prev_tokens = new ArrayList<String>();
        if (lemmaMap.containsKey(offset)) {
            addPreviousLemmas(prev_tokens, offset, token_count);
        }
        return prev_tokens;
    }

    /**
     * Returns the lemmas around the given offset: first the preceding tokens (nearest
     * first), then the following tokens (in sentence order).
     *
     * @param offset the offset of the reference token
     * @param token_count the maximum number of positions to inspect on each side
     * @return the lemmas found; empty when the offset is not a key of the lemma map
     */
    public ArrayList<String> getArroundLemmaTokens(Integer offset, Integer token_count) {
        ArrayList<String> arround_tokens = new ArrayList<String>();
        if (lemmaMap.containsKey(offset)) {
            addPreviousLemmas(arround_tokens, offset, token_count);
            addNextLemmas(arround_tokens, offset, token_count);
        }
        return arround_tokens;
    }

    /**
     * Returns the POS tag of the token at the given 1-based offset, read from the
     * space-separated {@code token/TAG} string of the sentence.
     *
     * @param offset the 1-based token offset
     * @return the text after the last '/' of the token, or "missing" when the offset
     *         is null, below 1 or past the last token, when no POS tags are set, or
     *         when the token carries no tag
     */
    public String getPOSTag(Integer offset) {
        if (offset == null || offset < 1 || posTags == null) {
            return MISSING_POS;
        }
        String[] taggedTokens = posTags.split(" ");
        if (offset > taggedTokens.length) {
            return MISSING_POS;
        }
        String taggedToken = taggedTokens[offset - 1];
        // The tag follows the LAST slash, so tokens that contain a slash themselves
        // still yield their tag.
        int slash = taggedToken.lastIndexOf('/');
        if (slash < 0 || slash == taggedToken.length() - 1) {
            return MISSING_POS;
        }
        return taggedToken.substring(slash + 1);
    }

    /** Returns the clauses found by the analysis, or null when no analysis has run. */
    public ArrayList<Clause> getClauses() {
        return clauses;
    }

    /** Returns the content of the related sentence artifact. */
    public String getContent() {
        return getRelatedSentence().getContent();
    }

    public void setRelatedSentence(Artifact relatedSentence) {
        this.relatedSentence = relatedSentence;
    }

    public Artifact getRelatedSentence() {
        return relatedSentence;
    }

    public void setSentContent(String sentContent) {
        this.sentContent = sentContent;
    }

    public String getSentContent() {
        return sentContent;
    }

    public void setPosTags(String posTags) {
        this.posTags = posTags;
    }

    public String getPosTags() {
        return posTags;
    }

    public void setStanDependenciesStr(String stanDependenciesStr) {
        this.stanDependenciesStr = stanDependenciesStr;
    }

    public String getStanDependenciesStr() {
        return stanDependenciesStr;
    }

    /** Null-safe test whether the relation name of the line equals one of the given names. */
    private static boolean relationIs(DependencyLine line, String... names) {
        String relation = line.relationName;
        if (relation == null) {
            return false;
        }
        for (String name : names) {
            if (name.equals(relation)) {
                return true;
            }
        }
        return false;
    }

    /** Null-safe test whether the relation name of the line starts with the given prefix. */
    private static boolean relationStartsWith(DependencyLine line, String prefix) {
        return line.relationName != null && line.relationName.startsWith(prefix);
    }

    /** Returns true for POS tags that start with "VB" or "MD"; false for null. */
    private static boolean isVerbOrModal(String pos) {
        return pos != null && (pos.startsWith("VB") || pos.startsWith("MD"));
    }

    /** Returns group 1 when the pattern matches the whole text; null otherwise or for null text. */
    private static String firstGroupOrNull(Pattern pattern, String text) {
        if (text == null) {
            return null;
        }
        Matcher m = pattern.matcher(text);
        return m.matches() ? m.group(1) : null;
    }

    /**
     * Returns the POS tag of the word at the given 1-based dependency offset, or null
     * when the sentence has no such word or the word has no tag.
     */
    private String posOfWord(int offset) throws SQLException {
        Artifact related_word = relatedSentence.getChildByWordIndex(offset - 1);
        return related_word == null ? null : related_word.getPOS();
    }

    /**
     * Appends the dependent of the line to the modifier list that the clause keeps for
     * the governor offset, creating the list when needed.
     *
     * @return the modifier list stored for the governor offset
     */
    private ArrayList<String> addModifier(Clause clause, DependencyLine depLine) {
        ArrayList<String> modifiers = clause.modifierDepMap.get(depLine.firstOffset);
        if (modifiers == null) {
            modifiers = new ArrayList<String>();
        }
        modifiers.add(depLine.secondPart);
        clause.modifierDepMap.put(depLine.firstOffset, modifiers);
        return modifiers;
    }

    /** Appends the lemmas at positions offset+1 .. offset+tokenCount that exist in the lemma map. */
    private void addNextLemmas(List<String> target, int offset, int tokenCount) {
        // Inclusive upper bound: with 1-based keys the last token has key size().
        // Harmless if the keys are 0-based, because every position is checked with
        // containsKey. NOTE(review): the key base of lemmaMap is decided by
        // StanfordDependencyUtil.getLemmaMap - confirm.
        int sentTokenCount = lemmaMap.size();
        for (int i = offset + 1; i <= tokenCount + offset && i <= sentTokenCount; i++) {
            if (lemmaMap.containsKey(i)) {
                target.add(lemmaMap.get(i));
            }
        }
    }

    /** Appends the lemmas at positions offset-1 down to offset-tokenCount that exist in the lemma map. */
    private void addPreviousLemmas(List<String> target, int offset, int tokenCount) {
        for (int i = offset - 1; i >= offset - tokenCount && i >= 0; i--) {
            if (lemmaMap.containsKey(i)) {
                target.add(lemmaMap.get(i));
            }
        }
    }
}