/* $Id: ChunkedSentence.java 988 2011-09-23 16:44:53Z hong1.cui $ */
/**
*
*/
package fna.charactermarkup;
import java.lang.reflect.Constructor;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.xpath.XPath;
import outputter.knowledge.TermOutputerUtilities;
import conceptmapping.*;
/**
*
* @author hongcui
* This class generates a chunked sentence from the parsing tree and provides a set of access methods to facilitate final annotation.
* A chunked sentence is a marked sentence (with organs enclosed by <> and states by {}) with "chunks" of text enclosed by [], for example
* <Heads> 3 , {erect} , [in corymbiform or paniculiform arrays]. (sent. 302)
*
* the annotation of a chunk may require access to the original parsing tree, but that is not handled by this class.
*/
@SuppressWarnings("unchecked")
public class ChunkedSentence {
//-- per-sentence state ------------------------------------------------------
private String glosstable = null; //name of the glossary table used for term lookups
private String markedsent = null; //the sentence with organs marked by <> and states by {}
private String chunkedsent = null; //the chunked sentence this class produces
private ArrayList<String> chunkedtokens = null; //tokens of markedsent; chunked runs collapse into one slot, emptied slots hold ""
@SuppressWarnings("unused")
private ArrayList<String> charactertokensReversed = new ArrayList<String>();
private int pointer = 0; //pointing at the next chunk to be annotated
private String subjecttext = null;
private String text = null;
private String sentsrc = null; //source identifier of the sentence
private String tableprefix = null; //prefix of the dataset tables
private Element root; //root element of the collapsed parse tree
//-- regex alternation constants used by the chunking heuristics -------------
public static final String binaryTvalues = "true|yes|usually";
public static final String binaryFvalues = "false|no|rarely";
public static final String pronouns = "them";
public static final String locationpp="near|from";
//NOTE(review): "�m" below looks like a mis-encoded "µm" -- confirm the file's source encoding
public static final String units= "cm|mm|dm|m|meter|meters|microns|micron|unes|�m|um";
public static final String percentage="%|percent";
//NOTE(review): "�" below looks like a mis-encoded degree sign "°" -- confirm
public static final String degree="�|degree|degrees";
public static final String times = "times|folds|lengths|widths";
public static final String per = "per";
public static final String more="greater|more|less|fewer";
public static final String counts="few|several|many|none|numerous|single|couple";
public static final String basecounts="each|every|per";
public static final String pairs="pair|pairs|series|array|arrays|row|rows";
public static final String clusters="cluster|clusters|involucre|involucres|rosette|rosettes|pair|pairs|series|ornament|ornamentation|array|arrays";
public static final String prepositions = "above|across|after|along|among|amongst|around|as|at|before|behind|beneath|between|beyond|by|for|from|in|into|near|of|off|on|onto|out|outside|over|than|throughout|to|toward|towards|up|upward|with|without";
public static final String stop = "a|about|above|across|after|along|also|although|amp|an|and|are|as|at|be|because|become|becomes|becoming|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|for|from|had|has|have|hence|here|how|if|in|into|inside|inward|is|it|its|may|might|more|most|near|no|not|of|off|on|onto|or|out|outside|outward|over|should|so|than|that|the|then|there|these|this|those|throughout|to|toward|towards|up|upward|was|were|what|when|where|which|why|with|within|without|would";
public static final String skip = "and|becoming|if|or|that|these|this|those|to|what|when|where|which|why|not|throughout";
public static final String positionprep = "of|part_of|in|on|between";
public static final String asasthan = "long|wide|broad|tall|high|deep|short|narrow|thick"; //as-long-as wide
public static final String size="long|longer|wide|wider|broad|broader|tall|taller|high|higher|deep|deeper|short|shorter|narrow|narrower|thick|thicker|length|width|height|depth|breadth";
//maps measurement adjectives to the character they measure, e.g. wide->width (filled in the constructor)
public static Hashtable<String, String> eqcharacters = new Hashtable<String, String>();
private boolean inSegment = false;
private boolean rightAfterSubject = false;
private int sentid = -1;
private ArrayList<String> pastpointers = new ArrayList<String>();
public String unassignedmodifier = null;
//caches
public static Hashtable<String, String> characterhash = new Hashtable<String, String>();
public static ArrayList<String> adverbs = new ArrayList<String>();
public static ArrayList<String> verbs = new ArrayList<String>();
public static ArrayList<String> nouns = new ArrayList<String>();
public static ArrayList<String> notadverbs = new ArrayList<String>();
public static ArrayList<String> notverbs = new ArrayList<String>();
public static ArrayList<String> notnouns = new ArrayList<String>();
protected Connection conn = null;
/*static protected String username = "root";
static protected String password = "root";
static protected String database = "fnav19_benchmark";*/
//-- debug print switches ----------------------------------------------------
private boolean printNorm = false;
private boolean printNormThan = false;
private boolean printNormTo = false;
private boolean printExp = false;
private boolean printRecover = false;
private String clauseModifierConstraint;
private String clauseModifierContraintId;
private String type; //sentence type; "character" triggers subject-organ recovery
private String characters; //'|'-separated character terms supplied by the caller
/**
 * Constructs a ChunkedSentence from tokens that are already chunked, then
 * runs recoverOrgans() to chunk any remaining modified organ names in place.
 * @param chunkedtokens tokens of the sentence, some slots already holding chunks
 * @param chunkedsent the chunked sentence text
 * @param conn open database connection used for glossary lookups
 * @param glosstable name of the glossary table
 * @param tableprefix prefix of the dataset tables
 */
public ChunkedSentence(ArrayList<String> chunkedtokens, String chunkedsent, Connection conn, String glosstable, String tableprefix){
this.chunkedtokens = chunkedtokens;
this.chunkedsent = chunkedsent;
this.conn = conn;
this.glosstable = glosstable;
this.tableprefix = tableprefix;
this.recoverOrgans();
}
/**
 * Constructs a ChunkedSentence from a marked sentence and its parse trees and
 * runs the full normalization/recovery pipeline, producing this.chunkedtokens
 * and this.chunkedsent (with segment marks inserted).
 * @param id sentence id
 * @param collapsedtree parse tree whose collapsed text supplies pre-formed chunks
 * @param tree original parse tree (kept for the signature; not read here)
 * @param tobechunkedmarkedsent sentence with organs marked by &lt;&gt; and states by {}
 * @param sentsrc source identifier of the sentence (used in debug output)
 * @param type sentence type; "character" triggers subject-organ recovery
 * @param tableprefix prefix of the dataset tables
 * @param conn open database connection used for glossary lookups
 * @param glosstable name of the glossary table
 * @param characters '|'-separated character terms; added to the shared nouns cache
 * @throws Exception from XPath evaluation in recoverSubjectOrgan4Character
 */
public ChunkedSentence(int id, Document collapsedtree, Document tree, String tobechunkedmarkedsent, String sentsrc, String type, String tableprefix,Connection conn, String glosstable, String characters) throws Exception {
//map measurement adjectives to the character (property) they measure
eqcharacters.put("wide", "width"); //2 cm. wide
eqcharacters.put("long", "length");
eqcharacters.put("broad", "width");
eqcharacters.put("diam", "diameter");
eqcharacters.put("size", "size");
eqcharacters.put("high", "height");
eqcharacters.put("height", "height");
eqcharacters.put("width", "width");
eqcharacters.put("length", "length");
eqcharacters.put("depth", "depth");
eqcharacters.put("breadth", "width");
this.tableprefix = tableprefix;
this.glosstable = glosstable;
this.characters = characters;
this.conn = conn;
this.type = type;
/*try{
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select term from "+glosstable+" where category='character'");
while(rs.next()){
nouns.add(rs.getString("term"));
}
}catch(Exception e){
e.printStackTrace();
}*/
nouns.addAll(Arrays.asList(characters.split("\\|")));
this.sentsrc = sentsrc;
this.sentid = id;
this.markedsent = tobechunkedmarkedsent;
//tobechunkedmarkedsent = tobechunkedmarkedsent.replaceAll("[\\[\\(]", " -LRB-/-LRB- ").replaceAll("[\\]\\)]", " -RRB-/-RRB- ").replaceAll("\\s+", " ").trim();
//escape literal brackets/parens as -LRB-/-RRB- tokens so they survive tokenization
tobechunkedmarkedsent = tobechunkedmarkedsent.replaceAll("[\\[\\(]", "-LRB-/-LRB-").replaceAll("[\\]\\)]", "-RRB-/-RRB-").trim();
if(tobechunkedmarkedsent.matches(".*?\\d.*")){
tobechunkedmarkedsent = NumericalHandler.normalizeNumberExp(tobechunkedmarkedsent);
}
String[] temp = tobechunkedmarkedsent.split("\\s+");
chunkedtokens = new ArrayList<String>(Arrays.asList(temp)); //based on markedsent, which provides <>{} tags.
root = collapsedtree.getRootElement();
String treetext = SentenceChunker4StanfordParser.allText(root).trim();
String[] treetoken = treetext.split("\\s+"); //based on the parsing tree, which holds some chunks.
String realchunk = "";
ArrayList<String> brackets = new ArrayList<String>(); //open-bracket stack used to track chunk nesting below
int i = 0;
//go through treetoken to chunk state lists, and brackets
for(; i<treetoken.length; i++){
if(treetoken[i].matches("^\\S+~list~\\S+")){//r[p[of] o[{architecture~list~smooth~or~barbellulate~to~plumose} (bristles)]]
//String[] parts = treetoken[i].split("~list~");
//treetoken[i] = parts[0]+"["+parts[1]+"]";
//treetoken[i] = treetoken[i].replace("~list~", "[{").replaceAll("\\{(?=\\w{2,}\\[)", "").replaceAll("(?<=~[a-z0-9-]{2,40})(\\}| |$)","}]");
treetoken[i] = treetoken[i].replace("~list~", "[{").replaceAll("\\{(?=\\w{2,}\\[)", "").replaceAll("(?<=~[a-z0-9-]{1,40})(\\}| |$)","}]");
}
}
//merge the tokens covered by a bracketed chunk from the tree into one chunkedtokens slot
for(i= 0; i<treetoken.length; i++){
if(treetoken[i].indexOf('[') >=0){
int bcount = treetoken[i].replaceAll("[^\\[]", "").trim().length();
for(int j = 0; j < bcount; j++){
brackets.add("[");
}
}
if(brackets.size()>0){//in
//restore original number expressions
String w = treetoken[i].replaceAll("(\\w+\\[|\\])", "");
realchunk += treetoken[i].replace(w, chunkedtokens.get(i))+" ";
chunkedtokens.set(i, "");
}
if(treetoken[i].indexOf(']')>=0){
int bcount = treetoken[i].replaceAll("[^\\]]", "").trim().length();
for(int j = 0; j < bcount; j++){
brackets.remove(0);
}
}
if(brackets.size()==0 && realchunk.length()>0){
chunkedtokens.set(i, realchunk.replaceAll("<", "(").replaceAll(">", ")").trim()); //inside a chunk, an organ is marked by #. e.g. #leaves#
realchunk="";
}
}
//an unterminated chunk at sentence end goes into the last slot
if(realchunk.length()>0){
chunkedtokens.set(i-1+0, realchunk.trim());
}
this.chunkedsent = "";
int discoveredchunks = 0;
discoveredchunks += normalizeThan();//do Than first before OtherINs
/*OtherINs first: r[p[equal-to] o[or {greater} than {depth}]] r[p[of] o[{adjacent} (prearticular)]] .
*Than first: {equal-to} or n[{greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]] . This is desired results for ChunkTHAN
*Besides, it is important to group all Than cases as ChunkTHAN, not split them between ChunkPrep and ChunkTHAN
*/
discoveredchunks += normalizeOtherINs(); //find objects for those VB/IN that without
discoveredchunks += normalizeBetween();
//discoveredchunks += normalizeThan();
discoveredchunks += normalizeTo();
normalizeUnits();
normalizePPList4Than(); //take care of orphaned 'equal-to or' 'as-short-as or'
normalizeAsAsThan();
int allchunks = chunks();
StanfordParser.countChunks(allchunks, discoveredchunks);
recoverSubjectOrgan4Character();
recoverVPChunks();//recover unidentified verb phrases
recoverConjunctedOrgans(); //
//findSubject(); no longer needed //set the pointer to a place right after the subject, assuming the subject part is stable in chunkedtokens at this time
recoverOrgans();
recoverCharacter4OrganList();
segmentSent();//insert segment marks in chunkedtokens while producing this.chunkedsent
//TODO move this to an earlier place
//if the last words in l[] are marked with {}, take them out of the chunk
//if(this.chunkedsent.matches(".*?l\\[[^\\[].*?}\\].*")){
// removeStateFromList();
//}
}
/**
 * Normalizes "as-X-as ..." comparisons into a single n[] chunk. Three cases:
 * 1. as-long-as wide                          (second part is another dimension adjective)
 * 2. as-long-as organ                         (second part is a modified organ)
 * 3. as-long-as width of organ                (dimension noun followed by r[p[of] ...])
 */
private void normalizeAsAsThan() {
for(int i = 0; i< this.chunkedtokens.size(); i++){
String token = this.chunkedtokens.get(i);
String chunk = token+" ";
boolean success = false;
if(token.matches("\\{?as-("+ChunkedSentence.asasthan+")-as\\}?")){//{as-long-as}: treat these as ChunkTHAN
//looking for the 2nd part: skip empty slots to the next real token
int j = 0; String t = "";
for(j = i+1; j<this.chunkedtokens.size(); j++){
t = this.chunkedtokens.get(j);
if(t.length()!=0) break;
}
if(t.matches("\\{?("+ChunkedSentence.asasthan+")\\}?")){ //case 1
chunk +=t+" ";
success = true;
}
else if(t.matches("\\{?(height|width|length|depth|thickness)\\}?")){ //case 3
chunk +=t+" ";
//look ahead for the r[p[of] ...] phrase completing "width of <organ>"
for(int k = j+1; k < this.chunkedtokens.size(); k++){
if(this.chunkedtokens.get(k).length()==0) continue;
if(this.chunkedtokens.get(k).startsWith("r[p[of")){
chunk += this.chunkedtokens.get(k)+" ";
j = k;
success = true;
break;
}
}
}
if(!success){
//case 2: collect modifiers until the organ "(...)" token, then the organ itself
while(!t.startsWith("(") && !t.equals(",")){//found bony in {bony} (portion)
chunk +=t+" ";
if(j < this.chunkedtokens.size()-1) t = this.chunkedtokens.get(++j);
else break;
success = true;
}
while((t.length()==0 || t.startsWith("("))){ //found (portion)
chunk +=t+" ";
if(j < this.chunkedtokens.size()-1) t = this.chunkedtokens.get(++j);
else break;
}
}
//form n[chunk] at slot i and blank the consumed slots i+1..j
if(success){
this.chunkedtokens.set(i, "n["+chunk.trim()+"]");
for(int k=i+1; k<=j; k++){
this.chunkedtokens.set(k, "");
}
}
}
}
}
/**
 * Pulls orphaned comparison prepositions (e.g. "{equal-to} or", "as-short-as or")
 * that precede an n[] chunk into that chunk. For example turns
 *   {equal-to} or n[{greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]]
 * into
 *   n[{equal-to} or {greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]]
 * Cases like n[less than or {equal-to} 35 {percent}] are already in the desired form.
 */
private void normalizePPList4Than() {
//search for n[] chunks
for(int i = 0; i< this.chunkedtokens.size(); i++){
if(this.chunkedtokens.get(i).startsWith("n[")){
//search back to include preceding prepositions/connectives
String preps = "";
int j;
for(j = i-1; j >= 0; j--){
String token = this.chunkedtokens.get(j);
if(token.length()==0) continue;
if(token.startsWith("r[") && token.indexOf("o[")<0){
preps = token+" "+preps; //object-less r[] phrase counts as a preposition
}else if(token.replaceAll("[{}]", "").matches(ChunkedSentence.prepositions+"|"+POSTagger4StanfordParser.comprepstring+"|as-("+ChunkedSentence.asasthan+")-as")){//equal-to, as-long-as
preps = token+" "+preps;
}else if(token.matches("or|,")){
preps = token+" "+preps;
}else{//encounter first non-prep part, end search
break;
}
}
preps = preps.trim();
//drop leading connectives so the merged chunk does not start with "or"/","
while(preps.startsWith("or") || preps.startsWith(",")){
//remove the leading (or|,)
preps = preps.replaceFirst("^(or|,)($| )", ""); //preps could be just "or" --in some wired sentences
j++;
}
if(preps.length()>0){
//blank the consumed slots and prepend preps inside the n[] chunk
for(int k = j+1; k<i; k++){
this.chunkedtokens.set(k, "");
}
this.chunkedtokens.set(i, "n["+preps+" "+this.chunkedtokens.get(i).replaceFirst("n\\[", ""));
}
}
}
}
/**
 * For sentences of type "character" (e.g. "contact between organ a and organ b"),
 * marks leading bare tokens that the parse tree tags as NN -- and that are not
 * character terms -- as organs by wrapping them in &lt;&gt;.
 * @throws Exception from XPath evaluation on the parse tree
 */
private void recoverSubjectOrgan4Character() throws Exception {
//if type is character and the first non-empty chunk is not a noun
if(type.equals("character")){
//short sentence whose first token is a bare word: mark it as the organ directly
if(this.chunkedtokens.size()<=2 && this.chunkedtokens.get(0).matches("\\w+")){
if(this.chunkedtokens.size()>1 && !this.chunkedtokens.get(1).matches("\\w+")){
this.chunkedtokens.set(0, "<"+this.chunkedtokens.get(0)+">");
return;
}
}
//skip empty slots to find the first real token
String token = "";
int i = 0;
while(token.length()==0 && i < this.chunkedtokens.size()){
token = this.chunkedtokens.get(i);
i++;
}
for(int j = i-1; j < this.chunkedtokens.size(); j++){ //process the leading bare tokens
token = this.chunkedtokens.get(j);
//stop at the first token that is already a chunk, organ, or state
if(token.length()==0 || token.indexOf("[")>0 || token.indexOf("<")>=0 || token.indexOf("{")>=0) break;
if(XPath.selectNodes(root, "//NN[@text='"+token+"']").size() > 0 &&
!token.matches("("+this.characters+")")){//bare token
token = "<"+token+">";
this.chunkedtokens.set(j, token);
}
}
}
}
/**
 * Counts the chunks currently present in chunkedtokens. A chunk is any token
 * of the form x[...] whose leading letter is not 'l' (l[...] lists are
 * excluded from the count).
 * @return the number of non-list chunks
 */
private int chunks() {
int total = 0;
for (String token : this.chunkedtokens) {
if (token.matches("[^l]\\[.*")) {
total++;
}
}
return total;
}
/**
 * Scans chunkedtokens to find verbs the parser did not mark as such, then
 * chunks them via recoverVPChunk. A token is taken as a verb when
 * 1. it is in the shared ChunkedSentence.verbs cache, or
 * 2. it ends in "ing" and sits between two organ tokens (the new verb is
 *    then added to the cache for later sentences).
 */
private void recoverVPChunks() {
for(int i = 0; i < this.chunkedtokens.size(); i++){
String t = this.chunkedtokens.get(i);
if(t.contains("-")) continue; //check 751
if(!t.contains("[") && ChunkedSentence.verbs.contains(t)){
recoverVPChunk(i);
}else if(!t.contains("[") && (t.endsWith("ing")|| t.endsWith("ing}"))){
if(connects2organs(i)){
ChunkedSentence.verbs.add(t.replaceAll("\\W", "")); //cache the discovered verb
recoverVPChunk(i);
}
}/*else if(!t.contains("[")&& t.endsWith("ed") && this.chunkedtokens.size()>i+1 && this.chunkedtokens.get(i+1).matches(".*?\\bby\\b.*")){
}*/
}
}
/**
 * Checks whether the token at index i sits between two organ tokens: one
 * immediately before it, and one at the next non-empty slot after it. An
 * organ token ends with '&gt;' or ')' or is an o[...] chunk.
 * @param i index of the candidate verb in chunkedtokens
 * @return true when organs are found on both sides of the token
 */
private boolean connects2organs(int i) {
boolean organ1 = false;
boolean organ2 = false;
if(i>=1 && this.chunkedtokens.size()>i+1){
String t = this.chunkedtokens.get(i-1);
if(t.endsWith(">") || t.matches(".*\\bo\\[[^\\]\\[]*\\]+") || t.endsWith(")") ){
organ1 = true;
}
//advance past empty placeholder slots to the next real token; bounded so we
//never index past the end (the original do-while threw
//IndexOutOfBoundsException when every remaining slot was empty)
t = "";
while(t.length()==0 && i+1 < this.chunkedtokens.size()){
i++;
t = this.chunkedtokens.get(i).trim();
}
if(t.endsWith(">") || t.matches("[uz]?\\[?\\bo\\[[^\\]\\[]*\\]+") || t.endsWith(")") ){
organ2 = true;
}
}
return organ1 && organ2;
}
/**
 * Attempts to form a verb-phrase chunk for a recovered verb. Scans forward
 * from index i for the verb's object (an organ marked with &lt;&gt;, an o[...]
 * chunk, or an l[...] list), then rewrites the covered tokens as one chunk
 * (b[v[...] o[...]], t[c[...] r[...]], etc.) stored at the last covered slot;
 * intermediate slots are blanked. Returns without changes when no object is
 * found, or when a "," immediately follows the verb.
 * @param i the index of a possible verb in chunkedtokens
 */
private void recoverVPChunk(int i) {
String chunk = "";
boolean foundo = false;
int j = i+1;
for(; j < chunkedtokens.size(); j++){
//scan for the end of the chunk TODO: may refactor with normalizeOtherINs on this search
String t = this.chunkedtokens.get(j);
if(j==i+1 && t.matches(",")){ //verb does not have an object
return;
}
if(t.matches("(;|\\.)")) break;
if(foundo && (t.contains("{") || t.contains("~list~")||t.matches("(\\w+|,|;|\\.)")||t.contains("["))){
break;
}
if(t.contains("<")){
chunk += t+" ";
foundo = true;
}else if(t.matches(".*?\\bo\\[[^\\]*]+") || t.matches(".*?l\\[[^\\]]*\\]+")){//found noun
//NOTE(review): the class "[^\\]*]" excludes both ']' and '*'; possibly meant "[^\\]]*" -- confirm
chunk += t+" ";
foundo = true;
j++;
break;
}else{
chunk += t+" ";
}
}
if(!foundo) return;
//format the chunk
chunk = chunk.trim();
if(chunk.endsWith(">")){
chunk = "b[v["+this.chunkedtokens.get(i)+"]"+" o["+chunk.replaceAll("<", "(").replaceAll(">", ")")+"]]";
}else if(chunk.matches(".*?\\bo\\[.*\\]+")){
if(chunk.contains(" v[")){
//fix: '[' must be escaped -- the original unescaped " v[" is an invalid
//regex and made replaceFirst throw PatternSyntaxException at runtime
chunk = chunk.replaceFirst(" v\\[", " v["+this.chunkedtokens.get(i)+" ");
}else if(chunk.matches("^r\\[.*")){//t[c[{extending}] r[p[to] o[(midvalve)]]]
chunk = "t[c["+this.chunkedtokens.get(i)+"] "+chunk+"]";
}else if(chunk.startsWith("l[")){
chunk = "b[v["+this.chunkedtokens.get(i)+"] "+chunk.replaceFirst("^l\\[", "o[")+"]";
}else if(chunk.startsWith("u[")){
//fix: escape '[' for the regex-based replaceFirst (was the invalid "^u[")
chunk = chunk.replaceFirst("^u\\[", "b[v["+this.chunkedtokens.get(i)+ "] ");
}
}
if(this.printRecover){
System.out.println("verb chunk formed: "+chunk +" for \n"+this.sentid+"["+this.sentsrc+"]"+this.markedsent);
}
//blank the consumed slots and place the chunk at the last one
for(int k = i; k<j; k++){
this.chunkedtokens.set(k, "");
}
this.chunkedtokens.set(j-1, chunk);
}
/**
 * Marks modified non-subject organs as chunks so their characters are not
 * attached to previous organs. Walks chunkedtokens from the end towards the
 * front and, for every token ending an organ name ('&gt;' or ')'), delegates
 * to recoverOrgan to form the chunk in place. Run after
 * recoverConjunctedOrgans to exclude organs that are objects of VP/PP phrases.
 * Does not attempt to recognize conjunctions (context-dependent decisions).
 * TODO: np-lists are not handled here yet.
 */
private void recoverOrgans() {
for(int idx = this.chunkedtokens.size()-1; idx >= 0; idx--){
String token = this.chunkedtokens.get(idx);
boolean endsOrgan = token.endsWith(">") || token.endsWith(")");
if(endsOrgan){
recoverOrgan(idx); //chunk and update chunkedtokens in place
}
}
}
/**
 * Forms an organ chunk ending at index last by collecting the organ name
 * parts and their modifiers backwards. Produces z[...] for a subject organ
 * (reached a "," or sentence start) or u[...] for a non-subject organ, and
 * may merge the new chunk into a preceding chunk that ends with "(...)".
 * @param last the index of the last part of an organ name
 */
private void recoverOrgan(int last) {
String chunk = this.chunkedtokens.get(last);
boolean foundm = false; //modifiers
boolean subjecto = false;
int i = last-1;
//for(;i >=this.pointer; i--){
for(;i >=0; i--){
String t = this.chunkedtokens.get(i);
boolean isspatial = false;
//spatial/position terms ({proximal} etc.) are treated as part of the organ name
if(Utilities.isPosition(t.replaceAll("[{<>}]", ""), conn, this.glosstable)){
t = t.replaceAll("\\{", "<").replaceAll("\\}", ">").replaceAll("<+", "<").replaceAll(">+", ">");
isspatial =true;
}
/*preventing "the" from blocking the organ following ",the" to being matched as a subject organ- mohan 10/19/2011*/
if(t.matches("the|a|an")){
if(i!=0){
i=i-1;
t = this.chunkedtokens.get(i);
}
}
/*end mohan*/
if((t.matches("\\{[\\w-]+\\}") && !isspatial)|| t.matches("(\\d+)") || t.contains("~list~")){
chunk = t+" "+chunk; //a state/count modifier: keep collecting
foundm = true;
}else if(!foundm && (t.endsWith(">") ||t.endsWith(")") || isspatial )){ //if m o m o, collect two chunks
chunk = t+" "+chunk;
}else{
if(t.equals(","))subjecto = true; //organ right after "," is a subject
else if((i==0 && t.matches("(a|an|the)"))){
subjecto = true;
this.chunkedtokens.set(0, ""); //remove the article
}
break;
}
}
chunk = chunk.trim();
//if(i==0) subjecto = true;
//reformat this.chunkedtokens
if(subjecto || i==-1){
chunk = "z["+chunk.trim().replaceAll("<", "(").replaceAll(">", ")")+"]";
}else{
chunk = "u["+chunk.trim().replaceFirst("[<(]", "o[(").replaceFirst("[)>]$", ")]").replaceAll("<", "(").replaceAll(">", ")").replaceAll("[{}]", "")+"]";//<leaf><blade> => u[o[(leaf)(blade)]]
}
//reset from i+2 to last
for(int j = i+1; j <last; j++){
this.chunkedtokens.set(j, "");
}
//back up over empty slots to locate the previous non-empty chunk
while(i>=0 && this.chunkedtokens.get(i).length()==0){
i--;
}
//if the previous nonempty chunk ends with a (), then merge this new u[] with the ()
if(i>=0 && this.chunkedtokens.get(i).matches(".*\\)\\W*\\]$")){
chunk = "("+chunk.replaceAll("(\\w+\\[|\\])", "").replaceAll(" ", ") (")+")";
chunk = chunk.replaceAll("\\(+", "(").replaceAll("\\)+", ")");
String previous = this.chunkedtokens.get(i);
String p1 = previous.substring(0, previous.lastIndexOf(")")+1);
previous = previous.replace(p1, p1+" "+chunk);
this.chunkedtokens.set(i, previous);
this.chunkedtokens.set(last, "");
}else{
//otherwise
this.chunkedtokens.set(last, chunk);
}
if(this.printRecover){
System.out.println("nsorgan chunk formed: "+chunk +" for \n"+this.sentid+"["+this.sentsrc+"]"+this.markedsent);
}
}
/**
 * Attempts to pull organs that were broken away from a PP (r[p...]) or VB
 * (b[v...]) phrase -- either conjuncted via "and|or|plus" or immediately
 * following as bare &lt;organ&gt; tokens -- back into that phrase's chunk.
 */
private void recoverConjunctedOrgans() {
for(int i = 0; i < this.chunkedtokens.size(); i++){
String t = this.chunkedtokens.get(i);
if(this.chunkedtokens.size()>i+2){
if((t.startsWith("r[p") || t.startsWith("b[v")) &&
(this.chunkedtokens.get(i+1).matches("(and|or|plus)")||
(this.chunkedtokens.get(i+1).matches(",") && this.chunkedtokens.get(i+2).matches("(and|or|plus)")))) {//check 211
recoverConjunctedOrgans4PP(i);
}else if((t.startsWith("r[p") || t.startsWith("b[v")) && this.chunkedtokens.get(i+1).startsWith("<")){//found a broken away noun
//collect the run of <organ> tokens that follows the phrase
//NOTE(review): this ++j scan has no explicit bounds check; it relies on a
//non-"<" token (e.g. the sentence-final ".") ending the run -- confirm
int j = i;
String newo = "";
String o = this.chunkedtokens.get(++j);
do{
newo += o;
this.chunkedtokens.set(j, "");
o = this.chunkedtokens.get(++j);
}while (o.startsWith("<"));
//splice the collected organs inside the phrase's closing brackets
String p1 = t.replaceFirst("\\]+$", "");
String p2 = t.replace(p1, "");
newo = newo.replaceAll("<", "(").replaceAll(">", ")").trim();
t = p1+" "+newo+p2;
this.chunkedtokens.set(i, "");
this.chunkedtokens.set(--j, t);
}
/*else if (t.startsWith("b[v") && this.chunkedtokens.get(i+1).matches("(and|or|plus)")){
recoverConjunctedOrgans4VB(i);
}*/
}
}
}
/**
 * Moves a state that immediately precedes an organ list into the list chunk:
 *
 *   a {fused} l[(cleithrum) and (suprascapula)] .
 * becomes
 *   a l[{fused} (cleithrum) and (suprascapula)] .
 */
private void recoverCharacter4OrganList(){
for(int i = 0; i < this.chunkedtokens.size(); i++){
String t = this.chunkedtokens.get(i);
if(t.startsWith("l[")){
//walk backwards collecting any {state} tokens directly before the list
for(int j=i-1; j>=0; j--){
if(this.chunkedtokens.get(j).length()==0) continue;
if(this.chunkedtokens.get(j).endsWith("}")){
t = "l["+this.chunkedtokens.get(j)+" "+t.replaceFirst("l\\[", "");
this.chunkedtokens.set(j, "");
this.chunkedtokens.set(i, t);
}else{
j=-1; //get out of j-loop
}
}
}
}
}
/**
 * Recovers a conjuncted organ into a preceding PP/VB chunk when what follows
 * the chunk is "and|or|plus" and a (modified) organ, followed by punctuation
 * or further chunks. The recovered text is spliced inside the chunk's
 * closing brackets and the consumed slots are blanked.
 * @param i the index where a PP-chunk followed by and|or|plus was found
 */
private void recoverConjunctedOrgans4PP(int i) {
String recovered = this.chunkedtokens.get(i+1)+" ";//and|or|plus
boolean foundo = false;
boolean recover = true;
int endindex = 0;
for(int j = i+2; j < this.chunkedtokens.size(); j++){
String t = this.chunkedtokens.get(j);
if(!foundo && (t.matches("\\{\\w+\\}") || t.equals(",") || t.contains("~list~"))){//states before an organ
recovered += t+" ";
}else if(t.matches("<\\w+>") || t.contains("l[")){//organ
recovered += t+" ";
endindex = j;
foundo = true;
}else if(foundo && t.matches("(,|;|\\.)")){//states before an organ
break; //organ followed by ",", should recover
}else if(foundo && t.contains("[") && !t.contains("~list~")){//found or not found organ
//do nothing
}else{
recover = false; //unexpected token: abandon the recovery
break;
}
}
if(recover){
//reformat: insert recovered before the last set of ]
String chunk = this.chunkedtokens.get(i);
String p1 = chunk.replaceFirst("\\]+$", "");
String p2 = chunk.replace(p1, "");
recovered = recovered.replaceAll("<", "(").replaceAll(">", ")").trim();
chunk = p1+" "+recovered+p2;
this.chunkedtokens.set(i, "");
//reset from i+1 to endindex
for(int j = i+1; j <endindex; j++){
this.chunkedtokens.set(j, "");
}
this.chunkedtokens.set(endindex, chunk);
if(this.printRecover){
System.out.println("pp/vp object chunk formed: "+chunk +" for \n"+this.sentid+"["+this.sentsrc+"]"+this.markedsent);
}
}
}
/**
 * Inserts segment marks into chunkedtokens while building this.chunkedsent
 * (the tokens are prepended, scanning backwards). Each organ/subject token
 * (&lt;...&gt; or z[...]) triggers a backwards search that either finds a
 * preposition (kept, no break) or a punctuation mark, which is wrapped as
 * SG&lt;punct&gt;SG to mark a segment boundary. A trailing punctuation token
 * is also wrapped. Afterwards disambiguateThose() resolves ", those of"
 * references.
 */
private void segmentSent() {
int i;
for(i = this.chunkedtokens.size()-1; i>=0; i--){
String t = this.chunkedtokens.get(i);
if(t.compareTo("") !=0){
this.chunkedsent = t+" "+this.chunkedsent;
}
if(t.indexOf('<')>=0 || t.indexOf("z[")>=0){//z[ is chunkOrgan
for(i = i-1; i>=0; i--){
String m = this.chunkedtokens.get(i);
if(m.matches(".*?\\b("+ChunkedSentence.prepositions+")\\b.*")){
this.chunkedsent = m+" "+this.chunkedsent;
break; //has prepositions before <
}
if(m.matches("(,|;|:)")){
this.chunkedtokens.set(i, "SG"+m+"SG"); //insert a segment mark
this.chunkedsent = "SG"+m+"SG"+" "+this.chunkedsent;
break;
}else{
if(m.compareTo("") !=0){
this.chunkedsent = m+" "+this.chunkedsent;
}
}
}
}
}
if(this.chunkedtokens.get(this.chunkedtokens.size()-1).matches("\\W")){
this.chunkedtokens.set(this.chunkedtokens.size()-1, "SG"+this.chunkedtokens.get(this.chunkedtokens.size()-1)+"SG");
}
//fix: String.trim() returns a new string; the original discarded the result,
//leaving chunkedsent with a trailing space
this.chunkedsent = this.chunkedsent.trim();
disambiguateThose();
}
/**
 * Resolves "those of" references in chunkedsent by replacing "those" with the
 * subject organ of the previous segment, then keeping chunkedtokens in sync.
 * Examples (note: some characters in the original examples are mis-encoded):
 *   <corollas> {purple} , those of {sterile} <florets> ... , those of {fertile} <florets> 15-18 {mm} .
 *   <phyllaries> {many} in 6-8 <series>... , <apices> {...}, those of {innermost} {bristly-ciliate-or-plumose} .
 * When "those" sits inside a chunk, the chunk is split into two around it.
 */
private void disambiguateThose() {
Pattern p = null;
if(this.chunkedsent.indexOf(" those r[p[of")>0){
//p = Pattern.compile("((?:.*?SG\\WSG.*|^)<(.*?)>.*?)those(\\s+r?\\[?p?\\[?of.*)");
//group(1)=text before "those" (starting at the last segment mark), group(2)=subject noun, group(3)=tail
p = Pattern.compile("((?:.*?SG\\WSG.*|^)(?:z\\[\\(|<)(.*?)(?:>|\\)\\]).*?)those(\\s+r?\\[?p?\\[?of.*)");
Matcher m = p.matcher(this.chunkedsent);
while(m.matches()){
String noun = m.group(2);
int indexOfthose = m.group(1).split("\\s+").length;
//in case there are to~12~cm, need to adjust indexOfthose
String textbeforethose = m.group(1);
Pattern pt = Pattern.compile("(.*?)\\b(to~\\d+~(?:"+ChunkedSentence.units+").*?)\\b(.*)");
Matcher mt = pt.matcher(textbeforethose);
while(mt.matches()){
textbeforethose = mt.group(3);
indexOfthose += mt.group(2).replaceAll("[^~]", "").length(); //each '~' hides one original token
mt = pt.matcher(textbeforethose);
}
//update chunkedsent and chunkedtokens
//"those" may be included in a chunk
String token = this.chunkedtokens.get(indexOfthose);
if(token.compareTo("those")==0){
//standalone "those": replace it directly with the recovered subject organ
String temp = m.group(1).trim();
temp = temp.replaceFirst(",$", "SG,SG");
this.chunkedsent = temp+" <"+noun+">"+m.group(3);
this.chunkedtokens.set(indexOfthose, "<"+noun+">");
if(this.chunkedtokens.get(indexOfthose-1).compareTo(",")==0){
this.chunkedtokens.set(indexOfthose-1, "SG,SG");
}
}else{//in a chunk: break the chunk into two
int indexOfchunk = findChunk(indexOfthose, "those");
String chunk = this.chunkedtokens.get(indexOfchunk);
String[] two = chunk.split("\\s*those\\s*");
two[0] += " ("+noun+")";
//find how many closing brackets are needed in two[0] and form the two new chunks
int lb = two[0].replaceAll("[^\\[]", "").length();
int rb = two[0].replaceAll("[^\\]]", "").length();
for(int i = 0; i<lb-rb; i++){
two[0]+="]";
two[1] = two[1].replaceFirst("\\]$", "");
}
String newchunk = two[0]+" "+two[1];
this.chunkedsent = this.chunkedsent.replace(chunk, newchunk);
//replace the old chunk with two chunks in this.chunkedtokens, using an empty neighbor slot
if(this.chunkedtokens.get(indexOfchunk+1).length()==0){
this.chunkedtokens.set(indexOfchunk, two[0]);
this.chunkedtokens.set(indexOfchunk+1, two[1]);
}else if(this.chunkedtokens.get(indexOfchunk-1).length()==0){
this.chunkedtokens.set(indexOfchunk-1, two[0]);
this.chunkedtokens.set(indexOfchunk, two[1]);
}
}
m = p.matcher(this.chunkedsent); //re-match: more "those" instances may remain
}
}
}
/**
 * Finds the index in this.chunkedtokens near indexOfkeyword holding a chunk
 * that contains the given keyword. The next non-empty token below is tried
 * first, then the next non-empty token above. Exits the JVM when neither
 * neighbor contains the keyword (should never happen).
 * @param indexOfkeyword the position the keyword was computed to be at
 * @param keyword the word to look for
 * @return the index of the chunk containing the keyword
 */
private int findChunk(int indexOfkeyword, String keyword) {
//search downwards for the next non-empty token
String candidate;
int below = indexOfkeyword;
do{
below++;
candidate = this.chunkedtokens.get(below);
}while(candidate.length()==0);
if(candidate.contains(keyword)){
return below;
}
//search upwards for the previous non-empty token
int above = indexOfkeyword;
do{
above--;
candidate = this.chunkedtokens.get(above);
}while(candidate.length()==0);
if(candidate.contains(keyword)){
return above;
}
System.out.println("Wrong chunks in ChunkedSentence, System exiting.");
System.exit(1); //should never reach here
return 0;
}
/**
 * Moves a trailing state out of an organ-list chunk into its own token:
 *
 *   l[(mid) and (distal) (cauline) {smaller}]
 * ==>
 *   l[(mid) and (distal) (cauline)] {smaller}
 *
 * Currently unused (see the commented-out call at the end of the constructor).
 */
@SuppressWarnings("unused")
private void removeStateFromList() {
for(int i = 0; i<this.chunkedtokens.size(); i++){
String t = this.chunkedtokens.get(i);
if(t.matches("l\\[[^\\[]*?}\\]")){
String list = t.substring(0, t.lastIndexOf(")")+1).trim(); //up to the last organ
String state = t.replace(list, "").replaceFirst("\\]$", "").trim(); //the trailing {state}
list= list+"]";
//place the two pieces using whichever neighbor slot is empty
//NOTE(review): assumes i+1 is in bounds when reached -- confirm for a list-final token
if(this.chunkedtokens.get(i+1).length()==0){
this.chunkedtokens.set(i, list);
this.chunkedtokens.set(i+1, state);
}else if(this.chunkedtokens.get(i-1).length()==0){
this.chunkedtokens.set(i-1, list);
this.chunkedtokens.set(i, state);
}else{
System.err.println("removeStateFromList messed up");
}
this.chunkedsent = this.chunkedsent.replace(t, list+" "+state);
}
}
}
/**
 * Merges a unit token into the preceding numeric chunk, e.g.
 * "3] {mm}" becomes "3 mm]" (the unit loses its {} / &lt;&gt; markers and
 * moves inside the chunk's closing brackets).
 */
private void normalizeUnits(){
for(int i = 0; i<this.chunkedtokens.size(); i++){
String word = this.chunkedtokens.get(i);
if(word.matches("[<{]("+ChunkedSentence.units+")[}>]")){
if(i-1>=0){
String latest = this.chunkedtokens.get(i-1);
if(latest.matches(".*?\\d\\]+$")){ //previous chunk ends with a digit then closing brackets
String rest = latest.replaceAll("\\]+$", "").trim(); //chunk body
String brackets = latest.replace(rest, "").trim(); //its closing brackets
String norm = rest+ " "+word.replaceAll("[{}<>]", "")+brackets; //mm, not {mm}
this.chunkedtokens.set(i-1, norm);
this.chunkedtokens.set(i, "");
}
}
}
}
}
/**
 * Normalizes comparative "than" constructions into n[] chunks, e.g.
 * "shorter and wider than ..." or "more/less smooth than ...". Scans for a
 * comparative (JJR: word ending in -er, "more", "less") followed by "than",
 * joins the subject tokens and the object tokens of "than" into one n[] chunk
 * placed at the position of "than", and blanks the consumed slots.
 * @return the number of n[] chunks formed
 */
private int normalizeThan(){
int count = 0;
String np = "";
int thani = 0;
int firstmorei = this.chunkedtokens.size(); //index of the first comparative; size() = "none found"
String more = "";
//"than" and "to" must not terminate the object scan of a than-comparison
String preps = ChunkedSentence.prepositions.replaceFirst("\\bthan\\|", "").replaceFirst("\\bto\\|", "");
if(this.markedsent.indexOf("than") >=0 ){
if(this.printNormThan){
System.out.println("Need to normalize Than! "+np);
}
for(int i = 0; i<this.chunkedtokens.size(); i++){
//scan for JJRs
String token = this.chunkedtokens.get(i);
if(more.length()==0 && (token.matches(".*?\\b(\\w+er|more|less)\\b.*") && (token.indexOf("<")<0)|| this.markedsent.indexOf(token+" than")>=0)){ //<inner> is not, but <longer> than is
firstmorei = i;
if(token.matches(".*?\\bmore\\b.*")){
more = "more";
}else if(token.matches(".*?\\b\\w+er\\b.*")){
more = "er";
}
}else if(more.compareTo("er") == 0 && !token.matches(".*?\\b(\\w+er|more|less|and|or|than)\\b.*") ){
more = ""; //the -er word was not part of a comparison; reset
firstmorei = this.chunkedtokens.size();;
}
if(token.matches(".*?\\bthan\\b.*")){
//needs normalization
thani = i;
if(firstmorei < thani){
//join all tokens between firstmorei and thani--this is the subject of "than"
for(int j = firstmorei; j<=thani; j++){
if(this.chunkedtokens.get(j).length()>0){
np += this.chunkedtokens.get(j)+" ";
}
this.chunkedtokens.set(j, "");
}
//scan for the object of "than"
for(i=i+1; i<this.chunkedtokens.size(); i++){
String w = this.chunkedtokens.get(i).replaceAll("(\\<|\\>|\\{|\\}|\\w+\\[|\\])", ""); //token stripped of markup
//if(w.matches("\\b("+preps+"|and|or|that|which|but)\\b") || w.matches("\\W")){
if(w.matches("\\b("+preps+"|and|that|which|but)\\b") || w.matches("\\p{Punct}")){ //should allow �, n[{shorter} than] � {campanulate} <throats>
//terminator found: close the n[] chunk at the "than" position
np = np.replaceAll("<", "(").replaceAll(">", ")").trim();
this.chunkedtokens.set(thani, "n["+np+"]");
count++;
break;
}else{
if(this.chunkedtokens.get(i).length()>0){
np += this.chunkedtokens.get(i)+" ";
}
this.chunkedtokens.set(i, "");
}
}
if(this.printNormThan){
System.out.println("Normalize Than! "+np);
}
//reset state for the next comparison in the same sentence
thani = 0;
firstmorei = this.chunkedtokens.size();
np = "";
}
}
}
}
return count;
}
/**
 * Normalizes "to" phrases, e.g.
 *   "expanded to <throats>"  =&gt; a w[...] chunk covering the whole phrase
 *   "to 6 m"                 =&gt; delegated to formRangeMeasure ("to~6~m")
 * Scans forward from "to" for an organ object, then backward for the start of
 * the phrase; if either scan fails, chunkedtokens is restored from a snapshot
 * taken at the top of the iteration.
 *
 * @return the number of "to" phrases normalized
 */
private int normalizeTo(){
	int count = 0;
	String np = ""; //accumulates the phrase being built
	boolean startn = false; //true once an organ/noun candidate has been seen in the forward scan
	//ArrayList<String> copy = (ArrayList<String>)this.chunkedtokens.clone();
	for(int i = 0; i<this.chunkedtokens.size(); i++){
		//snapshot so a failed normalization attempt can be rolled back
		ArrayList<String> copy = (ArrayList<String>)this.chunkedtokens.clone();
		String token = this.chunkedtokens.get(i);
		if(token.compareTo("to") == 0 || token.matches(".*?\\bto]+$")){
			//scan for the next organ
			for(int j = i+1; j<this.chunkedtokens.size(); j++){
				String t = this.chunkedtokens.get(j).trim();
				if(j==i+1 && t.matches("\\d[^a-z]*")){//match "to 6[-9]" ; not match "to 5-lobed"
					copy = formRangeMeasure(i);
					break;
				}
				if(startn && t.indexOf('<')<0){
					//organ already found and the run of organ tokens ended
					break;
				}
				//to b[v[expose] o[(stigma)]]
				if(t.matches("[,:;\\d]") || t.matches(".*?\\b[pv]\\[.*") ||t.matches(".*?\\b("+ChunkedSentence.prepositions+"|and|or|that|which|but)\\b.*")){
					break;
				}
				np +=t+" ";
				this.chunkedtokens.set(j, "");
				if(t.lastIndexOf(' ') >=0){
					t = t.substring(t.lastIndexOf(' ')); //last word there
				}
				if(t.indexOf('<')>=0 || t.indexOf('(')>=0){ //t may have []<>{}
					startn = true; //not break yet, may be the next token is a noun
				}
			}
			if(!startn){
				this.chunkedtokens = copy; //not finding the organ, reset
			}else{
				if(this.printNormTo){
					System.out.println("To needs normalization!");
				}
				np = "to "+np;
				//scan backward for the start of the chunk
				boolean startc = false; //find the start of the chunk
				for(int j = i-1; j>=0; j--){
					String t = this.chunkedtokens.get(j);
					if(t.matches(".*?\\b("+ChunkedSentence.prepositions+"|and|or|that|which|but)\\b.*") || t.matches(".*?[>;,:].*") ||(t.matches("^\\w+\\[.*") && j!=i-1) ){ //the last condition is to avoid nested chunks. cannot immediately before w[].e.g: b[v[{placed}] o[{close}]] w[to {posterior} (shell) (margin)] ;
						np = np.replaceAll("<", "(").replaceAll(">", ")").replaceAll("\\s+", " ").trim();
						//np = np.replaceAll("\\s+", " ").trim();
						this.chunkedtokens.set(i, "w["+np+"]"); //replace "to" with np
						count++;
						startn = false;
						startc = true;
						if(this.printNormTo){
							System.out.println("!normalizedTo! "+np);
						}
						break;
					}else{
						//still inside the phrase: prepend and consume the token
						np = t+" "+np;
						this.chunkedtokens.set(j, "");
					}
				}
				if(!startc){
					this.chunkedtokens = copy; //not finding the start of the chunk, reset
				}
			}
		}
	}
	return count;
}
/**
 * Collapses a "to &lt;number&gt; &lt;unit&gt;" pattern starting at index i into one
 * chunk token "to~number~unit" stored at slot i+2; slots i+1 and the "to"
 * part of slot i are cleared. No change is made unless a valid unit follows.
 * @param i index of "to" (or a token ending in "to]"), followed by a number
 * @return this.chunkedtokens (possibly modified in place)
 */
private ArrayList<String> formRangeMeasure(int i) {
	if(this.chunkedtokens.size() <= i+2){
		return this.chunkedtokens; //no room for a unit token
	}
	String unit = this.chunkedtokens.get(i+2).replaceAll("\\W", " ").trim();
	if(!unit.matches("("+ChunkedSentence.units+")")){
		return this.chunkedtokens; //third token is not a measurement unit
	}
	String merged = "to~"+this.chunkedtokens.get(i+1)+"~"+unit;
	this.chunkedtokens.set(i+2, merged);
	this.chunkedtokens.set(i+1, "");
	String toToken = this.chunkedtokens.get(i);
	if(toToken.equals("to")){
		this.chunkedtokens.set(i, "");
	}else{
		//token like "xyz to]]": strip the trailing " to" but keep the brackets
		this.chunkedtokens.set(i, toToken.replaceFirst("\\s+to(?=\\W+$)", ""));
	}
	return this.chunkedtokens;
}
/**
 * Normalizes "between ..." phrases, e.g.:
 *   between 5 and 10                                => 5-10
 *   between the frontal and the sphenotic spine     => r[p[between] o[the frontal and the sphenotic spine]]
 *   between anterior supraneural bone and neural spine of vertebra 4
 *                                                   => r[p[between] o[anterior supraneural bone and neural spine]] of vertebra 4
 *   between neural arches of vertebrae 3 and 4      => r[p[between] o[neural arches]] of vertebrae 3 and 4
 *
 * Open question (chained preps):
 * 948[Armbruster_2004.xml_ffbaa153-5288-4671-866c-33d14c78c44e.txt-0]:
 * <space> r[p[between] o[{posterior} (process)]] r[p[of] o[(coracoid) (strut) and {posterior} (process)]] r[p[of] o[(coracoid)]]
 *
 * Routes character/number objects to normalizeBetweenCharacters and
 * structure objects to normalizeBetweenStructures.
 *
 * @return the number of "between" phrases normalized
 */
private int normalizeBetween(){
	int count = 0;
	for(int i = 0; i<this.chunkedtokens.size(); i++){
		String token = this.chunkedtokens.get(i);
		if(token.matches(".*?\\bbetween\\b.*")){//between
			if(this.printNorm){
				System.out.println(token+" needs normalization!");
			}
			if(token.matches("r\\[.*? and .*?\\]")){//already a chunk, fix the format: r[p[between] the {frontal} and o[the (sphenotic) ({spine})]]
				token = token.replaceFirst("o\\[", "").replaceFirst("\\]\\s*", "] o[");
				this.chunkedtokens.set(i, token);
				return ++count;
			}
			//bug fix: "between" may be the sentence's last token; the previous code
			//indexed i+1 unconditionally and could throw IndexOutOfBoundsException
			if(i+1 >= this.chunkedtokens.size()){
				break;
			}
			String chara = Utilities.lookupCharacter(this.chunkedtokens.get(i+1).replaceAll("[<>(){}\\]\\[]", ""), conn, characterhash, this.glosstable, this.tableprefix);
			if(this.chunkedtokens.get(i+1).matches("\\d+.*") || (chara!=null && chara.compareToIgnoreCase("structure")!=0)){
				//deal with "between 5 and 10" => "5-10"
				//between red and purple => red to purple
				count += normalizeBetweenCharacters(i);
			}else{
				//find the nearest "and" that is not separated from "between" by any stopwords or puncts
				//if such "and" can not be found, find the nearest pl structure terms
				count += normalizeBetweenStructures(i);
			}
		}
	}
	return count;
}
/**
 * Intended to normalize "between X and Y" where X/Y are numbers or character
 * states (e.g. "between 5 and 10" =&gt; "5-10", "between red and purple" =&gt;
 * "red to purple"). NOT YET IMPLEMENTED: currently a no-op, so such phrases
 * are left unchanged in chunkedtokens.
 * @param prepindex index of "between" in chunkedtokens
 * @return the number of normalizations performed (always 0 for now)
 */
private int normalizeBetweenCharacters(int prepindex) {
	// TODO Auto-generated method stub
	return 0;
}
/**normalize one instance of "between" whose object is a structure phrase
 * find the nearest "and" that is not separated from "between" by any stopwords or puncts;
 * if such "and" can not be found, find the nearest pl structure terms
 * @param prepindex the index for the prep (between); it is the starting point for the search in chunkedtokens
 * @return 1 if a chunk was formed (via makeChunk4Between), 0 otherwise; e.g.
 * r[p[between] o[the frontal and the sphenotic spine]]
 * r[p[between] o[anterior supraneural bone and neural spine]] of vertebra 4
 * r[p[between] o[neural arches]] of vertebrae 3 and 4
 */
private int normalizeBetweenStructures(int prepindex) {
	int nearestN1 = 0; //index of the last structure token seen before "and"
	int nearestN2 = 0; //index of the last structure token seen after "and"
	int nearestAND = 0; //index of "and"; 0 means not yet found
	for(int i = prepindex+1; i < this.chunkedtokens.size(); i++){
		String token = this.chunkedtokens.get(i);
		if(nearestAND ==0 && (token.matches(".*?(\\b("+ChunkedSentence.prepositions+")\\b|,|\\.).*"))){
			//failed to find "and", make the chunk stop by nearestN1
			//check this before checking for < or (
			return makeChunk4Between(prepindex, nearestN1);
		}
		if(nearestAND == 0 && ((token.contains("<") || token.contains("(")))){
			nearestN1 = i;
		}
		if(nearestAND == 0 && (token.compareToIgnoreCase("and")==0)){
			nearestAND = i;
		}
		if(nearestAND > 0 && ((token.contains("<") || token.contains("(")))){
			nearestN2 = i;
		}
		if(nearestAND > 0 && nearestN2>0 && !token.contains("<") && !token.contains("(")){
			//found the 2nd organ, make the chunk
			return makeChunk4Between(prepindex, nearestN2);
		}
		if(nearestAND > 0 && (token.matches(".*?(\\b("+ChunkedSentence.prepositions+")\\b|,|\\.).*"))){
			//failed to find nearestN2, make the chunk stop now
			return makeChunk4Between(prepindex, i-1);
		}
	}
	return 0;
}
/**
 * form a chunk using all tokens from prepindex to endindex:
 * reset all these tokens in chunkedtokens and put the chunk at prepindex.
 * Structure tokens (containing "&lt;" or recognized as positions) are rewritten
 * as "(word)"; if the prep token is already a partial chunk containing
 * "[between]" the object is appended into it, otherwise a fresh
 * r[p[between] o[...]] chunk is built.
 * @param prepindex index of the "between" token
 * @param endindex index of the last token of the object (inclusive)
 * @return 1 if a chunk was made, 0 if endindex &lt;= prepindex (nothing to chunk)
 */
private int makeChunk4Between(int prepindex, int endindex) {
	//String chunk = "r[p["+this.chunkedtokens.get(prepindex)+"] o[";
	if(endindex <= prepindex) return 0;
	String chunk = "";
	for(int i = prepindex+1; i<=endindex; i++){
		String t = this.chunkedtokens.get(i);
		//mark organ/position words as (word); leave other tokens untouched
		t = t.contains("<") || Utilities.isPosition(t.replaceAll("[{}]", ""), conn, this.glosstable)? "("+t.replaceAll("[<>(){}]", "")+")": t;
		chunk +=t+" ";
		this.chunkedtokens.set(i, "");
	}
	if(this.chunkedtokens.get(prepindex).contains("[between]")){
		//prep token is already a partial chunk: splice the object before its closing brackets
		chunk = this.chunkedtokens.get(prepindex).replaceAll("\\]+$", " ")+chunk.trim()+"]]";
	}else{ //bare word between
		chunk = "r[p["+this.chunkedtokens.get(prepindex)+"] o["+chunk.trim()+"]]";
	}
	this.chunkedtokens.set(prepindex, chunk);
	return 1;
}
/**
 * Normalizes the remaining prepositional phrases (those not handled by the
 * than/to/between normalizers), e.g.
 *   most [of] lengths
 *   [in] xyz arrays
 * For each detected preposition it scans forward for an object noun phrase;
 * on success the phrase becomes an o[...] object attached to the prep chunk
 * (or a fresh r[p[...] o[...]] chunk for a bare prep word). On failure the
 * snapshot taken before the attempt is restored.
 * @return the number of phrases normalized
 */
private int normalizeOtherINs(){
	//boolean startn = false;
	int count = 0;
	//than/to/between have dedicated normalizers; exclude them here
	String preps = ChunkedSentence.prepositions.replaceAll("\\b(than|to|between)\\|", "");
	for(int i = 0; i<this.chunkedtokens.size(); i++){
		String token = this.chunkedtokens.get(i);
		if(token.matches(".*?p\\[\\{?[a-z]+\\}?\\]+") || token.matches(".*?\\b("+preps+")\\b\\]*$") ||
				token.matches(".*?\\b(as-.*?-as|same-.*?-as|\\w+-to|in-.*?-(with|to))\\b.*?")){//[of] ...onto]]
			token = token.replaceAll("[{}]", "");
			if(this.printNorm){
				System.out.println(token+" needs normalization!");
			}
			// a prep is identified, needs normalization
			ArrayList<String> copy = (ArrayList<String>)this.chunkedtokens.clone(); //rollback snapshot
			//String nscopy = null;
			String npcopy = null; //fallback noun phrase saved when a non-noun interrupts the scan
			ArrayList<String> ctcopy = null; //chunkedtokens state matching npcopy
			boolean startn = false; //a noun candidate has been seen
			String np = ""; //accumulated object phrase
			//String ns = "";
			boolean foundorgan = false; //a structure token or plural noun has been seen
			//boolean ofnumber = false;
			//look forward in chunkedtokens to find the object noun
			int j = 0;
			for(j = i+1; j<this.chunkedtokens.size(); j++){
				String t = this.chunkedtokens.get(j).trim();
				if(j==i+1 && t.matches("^[,;\\.]")){//"smooth throughout, ", but what about "smooth throughout OR hairy basally"?
					if(this.printNorm){
						System.out.println("encounter ',' immediately, no object is expected");
					}
					break;
				}
				/*if(t.startsWith("r[p[") && !np.matches(".*?\\b(or|and)\\b\\s+$")){
					npcopy = np;//TODO: 4/14/2011 check out 501.txt-4, 502.txt-5 "after flowering, 10 cm in fruit" 512.txt-11 "differing from inner, highly variable in <color>"
					break;
				}*/
				if(!foundorgan && startn && t.indexOf('<')<0 && t.indexOf('(')<0 && !Utilities.isNoun(t, nouns, notnouns)){ //test whole t, not the last word once a noun has been found
					//save ns for now, but keep looking for organs
					//nscopy = nscopy == null ? ns : nscopy; //keep only the first copy
					npcopy = npcopy == null? np : npcopy;
					ctcopy = ctcopy == null? (ArrayList<String>)this.chunkedtokens.clone():ctcopy;
				}
				//if(startn && !foundorgan && ishardstop(j)){
				if(!foundorgan && ishardstop(j)){
					//hard stop encountered, break
					//ns = nscopy;
					if(npcopy!=null && ctcopy!=null){
						//fall back to the phrase collected up to the interruption
						np = npcopy;
						this.chunkedtokens = ctcopy;
					}
					break;
				}
				if(foundorgan && t.indexOf('<')<0 && t.indexOf('(')<0){ //test whole t, not the last word once a noun has been found
					break; //break, the end of the search is reached, found organ as object
				}
				np +=t+" "; //any word in between
				this.chunkedtokens.set(j, "");
				if(t.indexOf('<')>=0 ||t.indexOf('(')>=0){ //t may have []<>{}
					startn = true; //not break yet, may be the next token is also a noun
					foundorgan = true;
				}
				if(!foundorgan && Utilities.isNoun(t, nouns, notnouns)){ //t may have []<>{}
					startn = true; //won't affect the value of foundorgan, after foundorgan is true, "plus" problem
					if(TermOutputerUtilities.isPlural(t)){
						//plural noun counts as an organ; mark it as (word) in the phrase
						foundorgan = true;
						np = np.trim();
						if(np.lastIndexOf(" ")>0){
							np = np.substring(0, np.lastIndexOf(" "))+" "+ "("+t.replaceAll("\\W", "")+") ";
						}else{
							np = "("+np.replaceAll("\\W", "")+") ";
						}
					}
				}
			}
			/*
			for(int j = i+1; j<this.chunkedtokens.size(); j++){
				String t = this.chunkedtokens.get(j).trim();
				if(startn && t.indexOf('<')<0 && !TermOutputerUtilities.isNoun(t, nouns)){ //test whole t, not the last word once a noun has been found
					break; //break, the end of the search is reached
				}
				np +=t+" ";
				this.chunkedtokens.set(j, "");
				if(t.indexOf('<')>=0 ||t.indexOf('(')>=0 || TermOutputerUtilities.isNoun(t, nouns)){ //t may have []<>{}
					startn = true; //not break yet, may be the next token is a noun
					ns += t+" ";
				}
			}
			*/
			//form the normalized chunk
			if(foundorgan || npcopy!= null /*|| ofnumber*/){
				//ns = ns.trim();
				//if(!ns.endsWith("]")){ //not already a chunk
				//np = np.replace(ns, "").trim();
				//ns = "("+ns.replaceAll("[{(<>)}]", "").replaceAll("\\s+", ") (")+")"; //mark the object as organ word by word
				//np = (np.replaceAll("<", "(").replaceAll(">", ")")+" "+ns).trim();
				np = np.replaceAll("<", "(").replaceAll(">", ")").replaceAll("\\s+", " ").trim();
				//}
				String symbol = "o";
				/*if(ofnumber){
					symbol = "c";
				}*/
				if(token.indexOf('[')>=0){
					//prep token is already a chunk: splice the object before its closing brackets
					String rest = token.replaceFirst("\\]+$", "").trim();
					String brackets = token.replace(rest, "").replaceFirst("\\]$", "").trim();
					token = rest + "] "+symbol+"["+np.trim()+"]"+brackets;
					this.chunkedtokens.set(i, token);
					if(this.printNorm){
						System.out.println("!normalized!: "+token);
					}
				}else{//without [], one word per token
					token = "r[p["+token+"] "+symbol+"["+np.trim()+"]]";
					this.chunkedtokens.set(i, token);
					if(this.printNorm){
						System.out.println("!normalized!: "+token);
					}
				}
				count++;
			}else{
				if(j-i==1){
					//cancel the normalization attempt on this prep, return to the original chunkedtokens
					this.chunkedtokens = copy;
				}else if(np.matches(".*? [\\d+%]$")){//reached the end of the sentence.This is the case for "plumose on distal 80 % ."?
					//also the same width dorsally as proximally
					this.chunkedtokens = copy;
					//np = np.replaceAll("\\s+", " ").trim();
					String head = token.replaceFirst("\\]+$", "").trim();
					String brackets = token.replace(head, "").replaceFirst("\\]$", "").trim();
					String rest = np.replaceFirst(".*?(?=(\\.|;|,|\\band\\b|\\bor\\b|\\w\\[))", "").trim();
					np = np.replace(rest, ""); //preserve spaces for later
					String object = np.replaceAll("\\s+", " ").trim();
					if(object.length()>0){
						token = head + "] o["+np.replaceAll("\\s+", " ").trim()+"]"+brackets;
						this.chunkedtokens.set(i, token);
						int npsize = np.split("\\s").length; //split on single space to preserve correct count of tokens
						for(int k = i+1; k<=i+npsize; k++){
							this.chunkedtokens.set(k, "");
						}
						if(this.printNorm){
							System.out.println("!default normalized to (.|;|,|and|or|r[)!: "+token);
						}
						count++;
					}
				}else{
					//cancel the normalization attempt on this prep, return to the original chunkedtokens
					this.chunkedtokens = copy;
				}
			}
		}
		//i=i+1;
	}
	/*if(!startn){
		this.chunkedtokens = copy;
	}*/
	return count;
}
/**
 * Decides whether token j terminates the object scan of a prepositional
 * phrase: an existing chunk (x[...]), a period, the last token of the
 * sentence, or a comma immediately followed by a structure token.
 * @param j index into chunkedtokens
 * @return true if the scan must stop at token j
 */
private boolean ishardstop(int j) {
	String current = this.chunkedtokens.get(j).trim();
	if (current.matches("^\\w\\[.*") || current.startsWith(".")) {
		return true; //already a chunk, or end of sentence
	}
	if (this.chunkedtokens.size() == j+1) {
		return true; //no following token
	}
	String following = this.chunkedtokens.get(j+1).trim();
	//a comma directly before a structure token starts a new segment
	return current.startsWith(",") && following.matches("^\\W*[<(].*");
}
/**
 * Returns the chunked sentence text (may be null before chunking completes).
 */
@Override //idiom fix: Object.toString override should be annotated
public String toString(){
	return this.chunkedsent;
}
/**
 * @return the index of the next chunk to be annotated
 */
public int getPointer(){
	return pointer;
}
//end mohan code
/**
 * Sets whether the cursor is currently inside a sentence segment.
 * @param yes the new in-segment state
 */
public void setInSegment(boolean yes){
	inSegment = yes;
}
/**
 * Sets whether the cursor is positioned right after the subject organ.
 * @param yes the new right-after-subject state
 */
public void setRightAfterSubject(boolean yes){
	rightAfterSubject = yes;
}
/**
 * @return true if there are more tokens to consume beyond the current pointer
 */
public boolean hasNext(){
	//idiom fix: return the comparison directly instead of if/return-true/false
	return pointer < this.chunkedtokens.size();
}
/**
 * @return the total number of tokens in this chunked sentence
 */
public int getSize(){
	return chunkedtokens.size();
}
/**
 * Advances through the tokens and returns the next non-null chunk, updating
 * the rightAfterSubject flag according to whether it is an organ chunk.
 * Returns an end-of-sentence chunk when no more chunks can be produced.
 */
public Chunk nextChunk(){
	Chunk result = getNextChunk();
	//getNextChunk may yield null on parsing failures; keep trying while tokens remain
	while (result == null && hasNext()) {
		result = getNextChunk();
	}
	this.rightAfterSubject = result instanceof ChunkOrgan;
	return result != null ? result : new ChunkEOS(".");
}
/**
 * returns the next Chunk: may be a
 * Organ, Value, Comparative Value, SimpleCharacterState, Subclause,
 * PrepChunk, IVerbChunk (Intransitive verb chunk, followed by a preposition), VerbChunk, ADJChunk.
 * May return null when a token could not be turned into a chunk (callers
 * such as nextChunk() retry); advances this.pointer as a side effect.
 * @return the next chunk, or null on a parsing failure for the current token
 */
@SuppressWarnings("rawtypes")
public Chunk getNextChunk(){
	Chunk chunk = null;
	String token = this.chunkedtokens.get(pointer);////a token may be a word or a chunk of text
	//skip over token slots blanked by the normalizers
	while(token.trim().length()==0){
		pointer++;
		token = this.chunkedtokens.get(pointer);
	}
	//NOTE(review): "�" below appears to be a mojibake'd special symbol (likely ± or ×) — confirm source encoding
	token = token.compareTo("�")==0? "moreorless" : token;
	token = token.matches(".*?\\d.*")? NumericalHandler.originalNumForm(token) : token;
	if(token.contains("relative~")){
		pointer++;
		return new ChunkCharacterComparison(token);
	}
	//all tokens:
	//number:
	//if(token.matches(".*?\\d+$")){ //ends with a number
	if(NumericalHandler.isNumerical(token) ||token.matches("^to~\\d.*")|| token.matches("h\\s*\\W\\s*w")|| token.matches("l\\s*\\W\\s*w")){//l-w or l/w
		chunk = getNextNumerics();//pointer++;
		if(this.unassignedmodifier != null){
			//attach any pending modifier to the numeric chunk
			chunk.setText(this.unassignedmodifier+ " "+chunk.toString());
		}
		return chunk;
	}
	if(token.indexOf("�")>0 && token.length()>0 && token.indexOf(" ")<0){
		//token: 4-9cm�usually15-25mm  (dimensions joined by the multiplication symbol)
		String[] dim = token.split("�");
		boolean isArea = true;
		int c = 0;
		for(int i = 0; i<dim.length; i++){
			isArea = dim[i].matches(".*?\\d.*") && isArea; //every part must contain a digit
			c++;
		}
		if(isArea && c>=2){
			token = token.replaceAll("�[^0-9]*", " � ").replaceAll("(?<=[^a-z])(?=[a-z])", " ").replaceAll("(?<=[a-z])(?=[^a-z])", " ").replaceAll("\\s+", " ").trim();
			chunk = new ChunkArea(token);
			pointer++;
			return chunk;
		}
	}
	if(token.indexOf("=")>0){//chromosome count 2n=, FNA specific
		String l = "";
		String t= this.chunkedtokens.get(pointer++);
		while(t.indexOf("SG")<0){ //collect tokens until the segment marker
			l +=t+" ";
			t= this.chunkedtokens.get(pointer++);
		}
		l = l.replaceFirst("\\d[xn]=", "").trim();
		chunk = new ChunkChrom(l);
		return chunk;
	}
	//create a new ChunkedSentence object for bracketed text
	if(token.startsWith("-LRB-/-LRB-")){
		ArrayList<String> tokens = new ArrayList<String>();
		String text = "";
		if(token.indexOf("-RRB-/-RRB-")<0){
			String t = this.chunkedtokens.get(++this.pointer);
			while(!t.endsWith("-RRB-/-RRB-")){
				tokens.add(t);
				text += t+ " ";
				if(this.pointer+1 < this.chunkedtokens.size()) t = this.chunkedtokens.get(++this.pointer); //missing RRB
				else break;
			}
		}
		text=text.trim();
		if(text.length()>0){ //when -LRB- and -RRB- are on the same line, text="" for example, as in -LRB-/-LRB-3--RRB-/-RRB-5-{merous} (3-)5-{merous}
			this.pointer++;
			if(!text.matches(".*?[,;\\.:]$")){
				text +=" .";
				tokens.add(".");
			}
			Chunk c = new ChunkBracketed(text);
			c.setChunkedTokens(tokens);
			return c;
		} //else, continue on
	}
	//create a new ChunkedSentence object for a subordinate clause
	if(token.startsWith("s[")){
		ArrayList<String> tokens = new ArrayList<String>();
		String text = token.replaceFirst("s\\[", "").replaceFirst("\\]$", "");
		//break text into correct tokens: s[that is {often} {concealed} r[p[by] o[(trichomes)]]] ;
		tokens = Utilities.breakText(text);
		this.pointer++;
		text=text.trim();
		if(!text.matches(".*?[,;\\.:]$")){
			text +=" .";
			tokens.add(".");
		}
		Chunk c = new ChunkSBAR(text);
		c.setChunkedTokens(tokens);
		return c;
	}
	if(token.matches("\\W") ){//treat L/RRBs as either , or null
		pointer++;
		this.unassignedmodifier = null;
		return new ChunkComma("");
	}
	if(token.matches("\\b(and|either)\\b")){
		pointer++;
		this.unassignedmodifier = null;
		return null;
	}
	//end of a segment
	if(token.matches("SG[;:\\.]SG")){
		this.inSegment = false;
		pointer++;
		//this.unassignedmodifier = null;
		return new ChunkEOL(""); //end of line/statement
	}
	if(token.matches("SG,SG")){
		this.inSegment = false;
		pointer++;
		this.unassignedmodifier = null;
		return new ChunkEOS("");//end of segment/subsentence
	}
	//start of a segment
	if(!this.inSegment){
		this.inSegment = true;
		chunk = getNextOrgan();//pointer++
		if(chunk != null){
			this.unassignedmodifier = null;
			return chunk;
		}
	}
	//all chunks: instantiate the fna.charactermarkup.Chunk* class named by chunkType via reflection
	if(token.matches("^\\w+\\[.*")){
		String type = chunkType(pointer);
		token = this.chunkedtokens.get(pointer); //as checkType may have reformatted token.
		try{
			if(type != null){
				Class c = Class.forName("fna.charactermarkup."+type);
				Constructor cons = c.getConstructor(String.class);
				pointer++;
				//deal with any unassignedmodifier when EOS is approached.
				//if(this.unassignedmodifier != null && this.chunkedtokens.get(pointer).matches("(SG)?\\W(SG)?")){
				if(this.unassignedmodifier != null){ //did not see why the 2nd condition is needed. Here, assuming any unassigned modifier should be applied to the next valid chunk
					token = token.replaceFirst("\\[", "["+this.unassignedmodifier+" ");
					this.unassignedmodifier = null;
				}
				return (Chunk)cons.newInstance(token.trim());
			}else{//if the chunk is not correctly formatted. Forward pointer to the next comma.
				//forward pointer to after the next [;:,.]
				if(this.printExp){
					System.out.println("PP without a Noun: "+token);
				}
				pointer++;
				/*String t = "";
				do{
					if(this.pointer < this.chunkedtokens.size()){
						t = this.chunkedtokens.get(this.pointer++);
					}else{
						break;
					}
				}while (!t.matches("[,;:\\.]"));*/
				return null;
			}
		}catch(Exception e){
			e.printStackTrace();
		}
	}
	//OR:
	if(token.compareTo("or") == 0){
		this.pointer++;
		return new ChunkOR("or");
	}
	//text:
	chunk = composeChunk();
	return chunk;
}
/**
 * Composes a chunk from plain (non-chunked) tokens starting at this.pointer:
 * accumulates modifiers, organs and character states into an a[...] (simple
 * character state), u[...] (non-subject organ), b[...] (verb) or t[...]
 * (character-prep) chunk, updating this.pointer to the first unconsumed
 * token. Returns null on a parsing failure (callers retry from the advanced
 * pointer). Relies on this.unassignedmodifier, this.rightAfterSubject and
 * this.pastpointers for cross-call state.
 * @return the composed chunk, or null if no chunk could be formed
 */
@SuppressWarnings("rawtypes")
private Chunk composeChunk() {
	Chunk chunk;
	String token;
	String scs = ""; //accumulated "symbol[content" fragments; closed with "]" on emission
	String role = "";
	boolean foundo = false;//found organ
	boolean founds = false;//found state
	if(this.unassignedmodifier != null){
		//fold any pending modifier in as an m[...] prefix
		scs =(scs.trim().length()>0? scs.trim()+"] ": "")+"m["+this.unassignedmodifier.replaceAll("[{}]", "")+" ";
		this.unassignedmodifier = null;
	}
	int i = 0;
	for(i = this.pointer; i<this.chunkedtokens.size(); i++){
		token = this.chunkedtokens.get(i);
		/* if one of the tokens match those in the stop list but not in skip list, skip it and get the next token- mohan 10/19/2011*/
		if(token.matches("("+stop+")") && !token.matches("("+skip+")")){
			i=i+1;
			token = this.chunkedtokens.get(i);
		}
		/*end mohan 10/19/2011*/
		token = token.matches(".*?\\d.*")? NumericalHandler.originalNumForm(token):token;
		if(token.length()==0){
			continue; //blanked slot
		}
		//token = NumericalHandler.originalNumForm(token); //turn -LRB-/-LRB-2
		if(token.matches("^\\w+\\[.*")){
			//modifier + a chunk: m[usually] n[size[{shorter}] constraint[than or {equaling} (phyllaries)]]
			//if(scs.matches("\\w{2,}\\[.*") && token.matches("\\w{2,}\\[.*")){ // scs: position[{adaxial}] token: pubescence[{pubescence~list~glabrous~or~villous}]
			if(scs.matches(".*?\\bo\\[\\w+\\s.*")){
				//an organ was accumulated before this chunk: emit it as a non-subject organ
				pointer = i;
				scs = scs.replaceAll("o\\[", "o[(").trim()+")]";
				return new ChunkNonSubjectOrgan("u["+scs+"]");
			}else if(scs.matches(".*?\\w{2,}\\[.*")){
				//a character state was accumulated: emit it before the chunk
				pointer = i;
				return new ChunkSimpleCharacterState("a["+scs.trim()+"]]");
			}else {
				//prepend the accumulated modifier to the chunk and instantiate it via reflection
				String type = chunkType(i); //changed from pointer to i
				token = this.chunkedtokens.get(i);
				token = token.matches(".*?\\d.*")? NumericalHandler.originalNumForm(token):token;
				scs = scs.trim().length()>0? scs.trim()+"] " : ""; //modifier
				String start = token.substring(0, token.indexOf("[")+1); //becomes n[m[usually] size[{shorter}] constraint[than or {equaling} (phyllaries)]]
				String end = token.substring(start.length());
				token = start+scs+end;
				try{
					if(type !=null){//r[p[as]] without o[]
						Class c = Class.forName("fna.charactermarkup."+type);
						Constructor cons = c.getConstructor(String.class);
						pointer = i+1;
						return (Chunk)cons.newInstance(token.trim());
					}else{ //parsing failure, continue with the next chunk
						pointer = i+1;
						return null;
					}
				}catch(Exception e){
					e.printStackTrace();
				}
			}
		}
		role = token.charAt(0)+""; //"<" marks an organ token
		token = token.replaceAll("[<>{}]", "");
		//<roots> {usually} <taproots> , {sometimes} {fibrous}.
		String symbol= this.rightAfterSubject? "type" : "o";
		if(!foundo && role.compareTo("<")==0){
			scs = (scs.trim().length()>0? scs.trim()+"] ": "")+symbol+"["+token+" ";
			foundo = true;
		}else if(foundo && role.compareTo("<")==0){
			scs += token+" ";
		}else if(foundo && role.compareTo("<") !=0){
			//organ run ended: emit the organ chunk
			this.pointer = i;
			scs = scs.replaceFirst("^\\]\\s+", "").replaceFirst(symbol+"\\[", "###[").replaceAll("\\w+\\[", "m[").replaceAll("###\\[", symbol+"[").trim()+"]"; //change all non-type character to modifier: <Inflorescences> {indeterminate} <heads>
			if(!this.rightAfterSubject){
				//reformat m[] o[] o[] to m[] o[()] o[()]
				String m = scs.substring(0, scs.indexOf("o["));
				String o = scs.substring(scs.indexOf("o[")).replaceAll("\\[", "[(").replaceAll("\\]", ")]");
				scs = m+o;
			}
			return this.rightAfterSubject? new ChunkSimpleCharacterState("a["+scs+"]") : new ChunkNonSubjectOrgan("u["+scs+"]"); //must have type[ or o[
		}
		if(token.matches(".*?"+NumericalHandler.numberpattern+"$") || token.matches("\\d+\\+?") || token.matches("^to~\\d.*")){ //0. sentence ends with a number, the . is not separated by a space
			if(scs.matches(".*?\\w{2,}\\[.*")){//must have character[
				pointer=i;
				scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
				return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
			}else{
				pointer=i;
				chunk = getNextNumerics();
				if(chunk!=null){
					if(scs.length()>0){
						scs = scs.replaceFirst("^\\]", "").trim()+"] "+chunk.toString();
					}else{
						scs = chunk.toString();
					}
					chunk.setText(scs);
					return chunk;
				}else{
					pointer++;
					return chunk; //return null, skip this token: parsing failure
				}
			}
		}
		//add to a state chunk until a) a preposition b) a punct mark or c)another state is encountered
		if(role.compareTo("<") !=0 && true){
			String chara = Utilities.lookupCharacter(token, conn, characterhash, glosstable, tableprefix);
			if(chara==null && Utilities.isAdv(token, adverbs, notadverbs)){
				//adverb: accumulate as a modifier
				scs = scs.trim().length()>0? scs.trim()+ "] m["+token+" " : "m["+token;
			}else if(token.matches(".*[,;:\\.\\[].*") || token.matches("\\b("+ChunkedSentence.prepositions+"|or|and)\\b") || token.compareTo("-LRB-/-LRB-")==0){
				//punctuation, preposition or conjunction ends the accumulation
				this.pointer = i;
				if(scs.matches(".*?\\w{2,}\\[.*")){//must have character[
					scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
					return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
				}else{
					if(scs.indexOf("m[")>=0){
						//carry the dangling modifier over to the next chunk
						this.unassignedmodifier = "{"+scs.trim().replaceAll("(m\\[|\\])", "").replaceAll("\\s+", "} {")+"}";
					}
					//pastpointers prevents an infinite loop on a token that repeatedly fails
					if(this.pastpointers.contains(i+"")){
						this.pointer = i+1;
					}else{
						this.pastpointers.add(i+"");
					}
					//if(token.matches("SG.SG")) return new ChunkEOS("");
					return null;
				}
			}else{
				//String chara = TermOutputerUtilities.lookupCharacter(token, conn, characterhash, glosstable, tableprefix);
				if(!founds && chara!=null){
					//first character state found: open its character[ fragment
					scs = (scs.trim().length()>0? scs.trim()+"] ": "")+chara+"["+token+" ";
					founds = true;
					if(i+1==this.chunkedtokens.size()){ //reach the end of chunkedtokens
						scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
						this.pointer = i+1;
						return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
					}
				}else if(founds && chara!=null && scs.matches(".*?"+chara+"\\[.*")){ //coloration coloration: dark blue
					scs += token+" ";
				}else if(founds){
					//By Zilong
					/*orig:a[{more} ventrally] a[{directed}]*/
					/*should be:a[more ventrally directed] */
					if(scs.matches("^comparison\\[more\\]\\s+m\\[\\w+\\s+$")){
						//now it only handles the simplest case, only consider "more"
						this.pointer = i+1;
						scs = scs.replaceFirst("comparison\\[", "m\\[");
						scs = scs.replaceFirst("\\] m\\[", " ").trim()+"] ";
						scs += chara+"["+token+" ";
					}else{
						//By Zilong End
						this.pointer = i;
					}
					scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
					return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
				}else if(chara==null){
					if(Utilities.isVerb(token, verbs, notverbs) && !founds){//construct ChunkVP or ChunkCHPP
						scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"v["+token+" ";
						//continue searching for either a <> or a r[]
						boolean findc = false; //find a chunk
						boolean findo = false; //find an organ
						boolean findm = false; //find a modifier
						boolean findt = false; //find a text token
						for(int j = i+1; j < this.chunkedtokens.size(); j++){
							String t = this.chunkedtokens.get(j).trim();
							if(t.length() == 0){continue;}
							if(t.startsWith("u[")){//form a vb chunk
								t = t.replaceFirst("^u\\[", "").replaceFirst("\\]$", "");
								String o = t.substring(t.indexOf("o[")).trim();
								t = t.substring(0, t.indexOf("o[")).trim();
								if(t.length()>0){
									//classify each state before the organ as a character or a modifier
									String[] states = t.split("\\s+");
									for(int k = 0; k < states.length; k++){
										String ch = Utilities.lookupCharacter(states[k], conn, characterhash, glosstable, tableprefix);
										if(ch!=null){
											scs = (scs.trim().length()>0? scs.trim()+"] ": "")+ch+"["+states[k].replaceAll("[{}]", "")+" ";
										}else{
											scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+states[k].replaceAll("[{}]", "")+" ";
										}
									}
								}
								scs = (scs.trim().length()>0? scs.trim()+"] ": "")+o;
								this.pointer = j+1;
								return new ChunkVP("b["+scs+"]");
							}
							String ch = Utilities.lookupCharacter(t, conn, characterhash, glosstable, tableprefix);
							if((!findc &&!findo) && t.matches("^[rwl]\\[.*")){
								scs = scs.replaceFirst("^\\]\\s+", "").trim()+"] ";
								scs += t;
								findc = true;
							}else if(!findo && t.indexOf("<")>=0){
								scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"o["+t.replace("<", "(").replace(">", ")").replaceAll("[{}]", "")+" ";
								findo = true;
							}else if(!findo && !findc && ch!=null){
								scs = (scs.trim().length()>0? scs.trim()+"] ": "")+ch+"["+t.replaceAll("[{}]", "")+" ";
							}else if(!findo && !findc && !findm && Utilities.isAdv(t, adverbs, notadverbs)){
								scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+t.replaceAll("[{}]", "")+" ";
								findm = true;
							}else if(!findo && !findc && findm && Utilities.isAdv(t, adverbs, notadverbs)){
								scs += t.replaceAll("[{}]", "")+" ";
							}else if(findo && t.indexOf("<")>=0){
								scs += t.replace("<", "(").replace(">", ")").replaceAll("[{}]", "")+" ";
							}else if((findo || findc) && t.indexOf("<")<0){ //must have foundo or foundc
								//object complete: emit a CHPP (verb+prep) or VP chunk
								this.pointer = j;
								if(findo){scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";}
								if(scs.indexOf("p[")>=0){
									return new ChunkCHPP("t["+scs.replace("v[", "c[")+"]");
								}else{
									scs = scs.replace("l[", "o[");
									if(scs.matches(".*?\\bv\\[[^\\[]* m\\[.*")){//v[comprising] m[a] architecture[surrounding] o[(involucre)]
										scs = format(scs);
										//scs = scs.replaceFirst("\\] o\\[", " ").replaceFirst("\\] m\\[", "] o[");
									}else if(scs.matches(".*?\\bv\\[[^\\[]* \\w{2,}\\[.*")){//v[comprising] architecture[surrounding]
										scs = format(scs);
										//scs = scs.replaceFirst("\\] o\\[", " ").replaceFirst("\\] \\w{2,}\\[", "] o[");
									}
									return new ChunkVP("b["+scs+"]");
								}
							}else if(t.matches(".*?\\W.*") || t.matches("\\b("+ChunkedSentence.prepositions+"|or|and)\\b") || t.compareTo("-LRB-/-LRB-")==0){
								if(scs.matches(".*?\\w{2,}\\[.*")){ //borne {singly
									this.pointer = j;
									scs = (scs.replaceFirst("^\\]", "").trim()+"]").replaceFirst("\\bv\\[[^\\[]*?\\]\\s*", "");
									return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
								}else{
									//search failed
									if(this.pastpointers.contains(i+"")){
										this.pointer = i+1;
									}else{
										this.pointer = i;
										this.pastpointers.add(i+"");
									}
									return null;
								}
							}else if(!findt){ //usually v[comprising] m[a {surrounding}] o[involucre]
								scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+t+" "; //taking modifiers
								findt = true;
							}else if(findt){
								scs += t+" ";
							}
						}
					}else{
						//unrecognized word: discard the accumulation
						scs = "";
					}
				}
			}
		}
	}
	if(i==this.chunkedtokens.size()){
		this.pointer = this.chunkedtokens.size();
	}
	return null;
}
/**
*
* @return e.g. 3 cm, what about "3 cm to 10 dm"?
* also 3 times (... longer than, as wide as ...)
*/
/*private Chunk getNextBroken() {
String result = "";
String type = "";
boolean found = false;
for(int i = pointer; i<this.chunkedtokens.size(); i++){
if(this.chunkedtokens.get(i).matches(".*?-")){ //ends with a hyphen
result += this.chunkedtokens.get(i)+ " ";
found = true;
type = checkType(i);
}
if(found){
result += this.chunkedtokens.get(i)+ " ";
pointer = i+1;
try{
if(type != null){
Class c = Class.forName(type);
Constructor cons = c.getConstructor(String.class);
return (Chunk)cons.newInstance(result.replaceAll("[<>]", "").trim());
}else{
return new SimpleCharacterState(result.replaceAll("[<>]", "").trim());
}
}catch(Exception e){
e.printStackTrace();
}
}
}
return null;
}*/
/**
 * Rewrites a verb chunk whose object is scattered over m[...]/o[...]
 * fragments into a single o[...] object, e.g.
 *   m[usually] v[comprising] m[a] architecture[surrounding] o[(involucre)]
 * becomes
 *   m[usually] v[comprising] o[1 architecture[surrounding] (involucre)]
 * The article "a" is converted to the count "1" (unless followed by
 * couple/few); m[...] and o[...] wrappers are stripped, other wrappers
 * (e.g. architecture[...]) are kept.
 * @param scs a chunk string containing a v[...] fragment
 * @return the reformatted chunk string
 */
private String format(String scs) {
	String first = scs.substring(0, scs.indexOf("v[")); //everything before the verb
	String rest = scs.replace(first, "");
	String v = rest.substring(0, rest.indexOf(']')+1+0); //the v[...] fragment itself
	String o = rest.replace(v, "").trim(); //m[a] architecture[surrounding] o[(involucre)]
	String newo = "o[";
	do{
		//peel off the next whitespace-delimited fragment
		String t = o.indexOf(' ')>=0? o.substring(0, o.indexOf(' ')) : o;
		//escape regex metacharacters in t before removing it from o via replaceFirst
		o = o.replaceFirst(t.replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]").replaceAll("\\(", "\\\\(").replaceAll("\\)", "\\\\)"),"").trim();
		if(t.startsWith("m[")){
			t = t.replaceAll("(m\\[|\\])", "").trim();
			if(t.compareTo("a") == 0 && !o.matches("(couple|few)")){
				t = "1"; //article "a" read as the count 1
			}
		}
		if(t.startsWith("o[")){
			t=t.replaceAll("(o\\[|\\])", "").trim();
		}
		newo+=t+" ";
	}while(o.length()>0);
	return first+v+" "+newo.trim()+"]";
}
/**
* TODO: deal with LRB-/-LRB
* @return e.g. 3 cm, what about "3 cm to 10 dm"?
* also 3 times (... longer than, as wide as ...)
*/
private Chunk getNextNumerics() {
String numerics = "";
String t = this.chunkedtokens.get(this.pointer);
//restore the un-normalized numerical form and drop uncertainty markers ("?")
t = NumericalHandler.originalNumForm(t).replaceAll("\\?", "");
//a "to~3..." range token becomes a value chunk directly
if(t.matches("^to~\\d.*")){
this.pointer++;
return new ChunkValue(t.replaceAll("~", " ").trim());
}
/*if(t.matches(".*?("+ChunkedSentence.percentage+")")){ //10percent won't work because it won't be seen as a numerical value in the first place
numerics += t+ " ";
pointer++;
return new ChunkValuePercentage(numerics.trim());
}
if(t.matches(".*?("+ChunkedSentence.degree+")")){
numerics += t+ " ";
pointer++;
return new ChunkValueDegree(numerics.trim());
}*/
if(t.matches(".*?[()\\[\\]\\-\\�\\d\\.�\\+���/�\\*/%]*?[�/�\\d][()\\[\\]\\-\\�\\d\\.�\\+���/�\\*/%]*(-\\s*("+ChunkedSentence.counts+")\\b|$)")){ //ends with a number
numerics += t+ " ";
pointer++;
//numeric token is the last token of the sentence: a bare count
if(pointer==this.chunkedtokens.size()){
return new ChunkCount(numerics.replaceAll("[{()}]", "").trim());
}
t = this.chunkedtokens.get(this.pointer);//read next token
//the token after the number decides the chunk type: percentage, degree, unit, or times
if(t.matches("^[{<(]*("+ChunkedSentence.percentage+").*")){
numerics += t+ " ";
pointer++;
return new ChunkValuePercentage(numerics.replaceAll("[{(<>)}]", "").trim());
}
if(t.matches("^[{<(]*("+ChunkedSentence.degree+")\\b.*")){
numerics += t+ " ";
pointer++;
return new ChunkValueDegree(numerics.replaceAll("[{(<>)}]", "").trim());
}
if(t.matches("^[{<(]*("+ChunkedSentence.units+")\\b.*?")){
numerics += t+ " ";
pointer++;
adjustPointer4Dot(pointer);//in bhl, 10 cm . long, should skip the ". long" after the unit
numerics = numerics.replaceAll("[{(<>)}]", "").trim();
//a "�" (multiplication sign) between measurements marks an area value
if(numerics.contains("�")){
return new ChunkArea(numerics);
}
return new ChunkValue(numerics);
}
//"N times ..." comparison: consume following character chunks until a non-character chunk
if(t.matches("^[{<(]*("+ChunkedSentence.times+")\\b.*?")){
numerics += t+ " ";
pointer++;
numerics = numerics.replaceAll("[{(<>)}]", "");
String size = numerics.trim();
Chunk c = nextChunk();
while(c.toString().contains("character")){
numerics +=c.toString().replaceAll("(\\w+\\[|\\])", "")+" ";
c = nextChunk();
}
numerics +=c.toString();
if(c instanceof ChunkTHAN){
return new ChunkTHAN(numerics.replaceFirst(size, "size["+size+"]"));
}else{
//if(c instanceof ChunkTHANC){
// return new ChunkValue(numerics);//1.5-2 times n[size[{longer} than {wide}]]
//}else{
return new ChunkComparativeValue(numerics);//1-2 times a[shape[divided]]???; 1-2 times shape[{shape~list~pinnately~lobed~or~dissected}];many 2-4[-6+] times a[size[widths]];[0.5-]1.5-4.5 times u[o[(leaves)]];0.4-0.5 times u[o[(diams)]]
}
}
/*if(found && this.chunkedtokens.get(i).matches("^("+this.per+")\\b.*?")){
numerics += this.chunkedtokens.get(i)+ " ";
pointer = i+1;
return new ChunkBasedCount(numerics.replaceAll("[<>]", "").trim());
}*/
//no qualifying token followed the number: plain count
return new ChunkCount(numerics.replaceAll("[{()}]", "").trim());
}
//l/w: length/width ratio, e.g. "l / w 2-3"; scan forward to the numeric part
if(t.matches("l\\s*\\W\\s*w")){
while(!t.matches(".*?\\d.*")){
t = this.chunkedtokens.get(++this.pointer)+" ";
}
this.pointer++;
String next = this.chunkedtokens.get(this.pointer);//read next token
if(next.matches("^[{<(]*("+ChunkedSentence.percentage+").*")){
t += next.replaceAll("[{<()>}]", "")+ " ";
pointer++;
}
return new ChunkRatio(NumericalHandler.originalNumForm(t).trim(), "length/width");
}
//h/w:height/width ratio, handled the same way as l/w
if(t.matches("h\\s*\\W\\s*w")){
while(!t.matches(".*?\\d.*")){
t = this.chunkedtokens.get(++this.pointer)+" ";
}
this.pointer++;
String next = this.chunkedtokens.get(this.pointer);//read next token
if(next.matches("^[{<(]*("+ChunkedSentence.percentage+").*")){
t += next.replaceAll("[{<()>}]", "")+ " ";
pointer++;
}
return new ChunkRatio(NumericalHandler.originalNumForm(t).trim(), "height/width");
}
return null;
}
/**
* Needed for cases like "10 cm . long/broad/wide/thick": skips the ". " that follows a unit.
* @param pointer the index of the token immediately after the unit
*/
private void adjustPointer4Dot(int pointer) {
//boolean iscase = false;
//skip empty tokens after the unit (advances the LOCAL parameter only, not the field)
while(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().length()==0){
pointer++;
}
//NOTE(review): the local parameter shadows the field this.pointer. When a "." is found,
//this.pointer is advanced by exactly one, so any empty tokens skipped above are not
//reflected in this.pointer — presumably empty tokens are harmless to revisit; confirm.
if(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().matches("\\.")){//optional
this.pointer++;
}
/*while(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().length()==0){
pointer++;
}
while(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().matches("[{(<]?(long|broad|wide|thick)[})>]?")){//required
pointer++;
iscase = true;
}
if(iscase){
this.pointer = pointer;
}*/
}
/**
*
* @return e.g. z[m[leaf] e[blade]], apex,
* margins and apexes
* {} <> <>
* {} ()
*/
public Chunk getNextOrgan() {
    // Scans forward from the pointer, accumulating tokens until an organ chunk is
    // complete. Pre-labeled chunks (z[, l[, u[) are returned directly; otherwise
    // tokens are collected until a token ending in ">"/")" (an organ word) is seen
    // and the phrase ends.
    String organ = "";
    boolean found = false; // true once a token ending with ">"/")" (organ word) was accumulated
    int i = 0;
    for (i = pointer; i < this.chunkedtokens.size(); i++) {
        String token = this.chunkedtokens.get(i);
        if (token.startsWith("z[")) {
            pointer++;
            return new ChunkOrgan(token);
        }
        if (token.startsWith("l[")) {
            pointer++;
            return new ChunkNPList(token);
        }
        if (token.startsWith("u[")) {
            pointer++;
            return new ChunkNonSubjectOrgan(token);
        }
        // A preposition or punctuation terminates the scan.
        if (token.matches(".*?\\b(" + ChunkedSentence.prepositions + ")\\b.*") || token.matches(".*?[,;:\\.].*")) {
            break;
        }
        // "and"/"or" continues the organ phrase, so reset the completion flag.
        if (found && token.matches("\\b(and|or)\\b")) {
            found = false;
        }
        // An organ was completed and the current token does not extend it: emit it.
        if (found && !token.matches(".*?[>)]\\]*$")) {
            pointer = i;
            return buildOrganChunk(organ);
        }
        organ += token + " ";
        if (token.matches(".*?[>)]\\]*$")) {
            found = true;
        }
    }
    if (found) {
        pointer = i;
        return buildOrganChunk(organ);
    }
    return null;
}

/**
 * Normalizes an accumulated organ phrase and wraps it as a z[...] organ chunk.
 * Extracted from the two identical normalization blocks in getNextOrgan.
 * @param organ the space-joined tokens collected by getNextOrgan
 * @return the normalized ChunkOrgan
 */
private Chunk buildOrganChunk(String organ) {
    // NOTE(review): matches() requires the WHOLE string to match "^\\w+\\[", so this
    // branch rarely (if ever) fires for multi-token phrases — preserved as in the original.
    if (organ.matches("^\\w+\\[")) {
        organ = organ.replaceAll("(\\w+\\[|\\])", "");
    }
    organ = organ.replaceAll("[<(]", "(").replaceAll("[>)]", ")").trim();
    return new ChunkOrgan("z[" + organ + "]");
}
/**
* use the un-collapsedTree (this.tree) to check the type of a chunk with the id,
* @param i
* @return:
SBAR: s
VP: b[v/o]
PP: r[p/o]
VP-PP: t[c/r[p/o]]
ADJ-PP:t[c/r[p/o]]
Than: n
To: w
NPList: l
PPList: i
main subject: z[m/e]
non-subject organ/structure u[m[] relief[] o[]]
character modifier: a[m[largely] relief[smooth] m[abaxially]]
*/
private String chunkType(int id) {
String token = this.chunkedtokens.get(id);
//any chunk opening with a 2+ letter label (e.g. "shape[...]") is a state list
if(token.matches("^\\w{2,}\\[.*")){
return "ChunkSL"; //state list
}
/*if(token.startsWith("q[")){
return "ChunkQP";
}*/
/*if(token.startsWith("s[")){
return "ChunkSBAR";
}*/
if(token.startsWith("b[")){//z[{longitudinal} (ridge)] b[v[{running}] o[the {length}]] r[p[of] o[the ({quadrate})]] laterally .
if(token.matches(".*\\)\\]+")){
return "ChunkVP";
}else if(token.indexOf(" o[")<0){//turn it into a simple character chunk, for example "*meet* posteriorly"
token = token.replaceAll("([bv]\\[|\\]|\\{|\\})", "").trim();
Utilities.insert2TermCategoryTable(token, "feature", conn, this.tableprefix);
token = "a[feature["+token+"]]";
this.chunkedtokens.set(id, token);
return "ChunkSimpleCharacterState";
}else{//z[{longitudinal} (ridge)] b[v[{running}] o[the {length}]] r[p[of] o[the ({quadrate})]] laterally .
//find the next non-empty token; if it is a prep phrase ending in an organ, merge it into this VP
String nexttoken = "";
int i = id+1;
while(nexttoken.length()==0 && i < this.chunkedtokens.size()){
nexttoken = this.chunkedtokens.get(i++);
}
if(nexttoken.matches("r\\[p.*?o\\[.*?\\)\\]+")){//merge
token = token.replaceFirst("\\] o\\[", " ").replaceFirst("\\]+", "").replaceAll("\\s+", " ");
nexttoken = nexttoken.replaceFirst("r\\[p\\[", "");
this.chunkedtokens.set(id, token+" "+nexttoken);
this.chunkedtokens.set(i-1, "");
return "ChunkVP";
}
return "ChunkVP"; //return positively anyway
}
}
//if(token.startsWith("r[") && token.indexOf("[of]") >= 0){
// return "ChunkOf";
//}
//prepositional phrase chunks: may be re-typed as value/degree/percentage depending on their object
if(token.startsWith("r[")){
if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?.*("+ChunkedSentence.degree+")[}>)]?\\]+")){
token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "").replaceAll("[<{()}>]", "");
this.chunkedtokens.set(id, token);
return "ChunkValueDegree";
}else if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?.*("+ChunkedSentence.percentage+")[}>)]?\\]+")){
token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "").replaceAll("[<{()}>]", "");
this.chunkedtokens.set(id, token);
return "ChunkValuePercentage";
}else
//r[p[around] o[10 mm]] should be ChunkValue
if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?.*("+ChunkedSentence.units+")\\]+")){
token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "");
this.chunkedtokens.set(id, token);
return "ChunkValue";
}else if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?\\]+") && !token.matches(".*[�x]\\].*")){//r[p[at] o[30�]] is not a value
token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "");
this.chunkedtokens.set(id, token);
return "ChunkValue";
}else if(token.indexOf("o[")>=0 /*|| token.indexOf("c[")>=0*/){
//r[p[without] o[or r[p[with] o[{poorly} {developed} {glutinous} ({ridge})]]]] ;
token = token.replaceAll("r\\[p\\[of\\]\\]", "of");
this.chunkedtokens.set(id, token);
//r[p[for] o[{dorsal} 12 , {form}]] SG.SG
if(token.matches(".*? o\\[.*?, \\{\\w+\\}\\]+") && id >= this.chunkedtokens.size()-2){
token = token.replaceFirst(", \\{\\w+\\}(?=\\]{1,3})","");
this.chunkedtokens.set(id, token);
}
//nested preps
if(token.matches(".*?\\[p\\[\\w+\\] o\\[\\w+ r\\[p\\[.*")){
Pattern p = Pattern.compile("(.*?\\[p\\[\\w+)(\\] o\\[)(\\w+ )(r\\[p\\[)(.*)");
Matcher m = p.matcher(token);
if(m.matches()){
token = m.group(1)+" "+m.group(3)+m.group(5).replaceFirst("\\]\\]\\s*$", "");
this.chunkedtokens.set(id, token);
}
}
return "ChunkPrep";
}else if(token.indexOf("-as")>0 && !token.startsWith("n[")){//as-wide-as, same-width-as:r[p[{same-width-distally-as}]]
//a[intensity_level_or_thickness[thin]]
//repack as ChunkSimpleCharacterState
token = token.substring(token.lastIndexOf("[")+1, token.indexOf("]")).replaceAll("[{}]", ""); //same-width-distally-as
String charword = token.replaceFirst(".*?-", "").replaceFirst("-.*", "");
String chara = Utilities.lookupCharacter(charword, this.conn, ChunkedSentence.characterhash, glosstable, tableprefix);
if(chara==null) return null;
else{
token = token.replace("-", " ");
String nexttoken = this.chunkedtokens.get(id+1);
//an untagged following token is absorbed into the character state
if(nexttoken.indexOf("[")<0){
token = "a["+chara+"["+token+" "+nexttoken+"]]";
this.chunkedtokens.set(id+1, "");
}else token = "a["+chara+"["+token+"]]";
this.chunkedtokens.set(id, token);
return "ChunkSimpleCharacterState";
}
}else{
return null;
}
}
if(token.startsWith("t[")){
//this was for FNAv19, but it seemed all t[ chunks were only generated by composeChunk, bypassing this step. t[ chunks generated by chunking does not seem to need this reformatting.
//reformat c[] in t[]: c: {loosely} {arachnoid} : should be m[loosely] architecture[arachnoid]
/*Pattern p = Pattern.compile("(.*?\\b)c\\[([^]].*?)\\](.*)");
Matcher m = p.matcher(token);
String reformed = "";
if(m.matches()){
reformed += m.group(1);
String c = reformCharacterState(m.group(2));
reformed += c+ m.group(3);
}
this.chunkedtokens.set(id, reformed);*/
return "ChunkCHPP"; //character/state-pp
}
if(token.startsWith("n[")){//returns three different types of ChunkTHAN
//n[{equal-to} or {greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]]
//n[{greater} than or {equal-to} {depth} r[p[of] o[{adjacent} (prearticular)]]]
//n[{as-long-as} or {greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]]
//n[{as-long-as} {depth} r[p[of] o[{adjacent} (prearticular)]]]
String beforethan = "";
String charword= "";
String beforechar = "";
String afterthan = "";
String chara = null;
String keyword = ""; //than, as long as, etc.
//split the token into the part before and after the comparison keyword
if(token.indexOf(" or ")>0 || token.startsWith("or ")){
//find
Pattern p = Pattern.compile("(\\bor\\b.*?\\b(?:than|to|as)\\b)"); //equal-to, same as, or same-as
Matcher m = p.matcher(token);
m.find();
keyword = "than"; // if "than" is part of " or " conjunction, then keyword is default to "than"
beforethan = token.substring(0, m.start()+m.group(1).length()+1).trim(); //including 'than': {equal-to} or {greater} than | {greater} than or {equal-to}
afterthan = token.substring(m.start()+m.group(1).length()+1).trim(); //anything follows before than
String temp = "";
if(beforethan.indexOf(" than ")>0)
temp = beforethan.substring(0, beforethan.indexOf(" than ")).trim();
if(beforethan.endsWith(" than"))
temp = beforethan.substring(0, beforethan.length()-4).trim();
if(temp.length()>0){
charword = temp.substring(temp.lastIndexOf(" ")>0? temp.lastIndexOf(" ") : temp.length()).trim(); //word before "than"
beforechar = "";
}
if(!charword.matches("("+ChunkedSentence.more+")")){
chara = Utilities.lookupCharacter(charword, this.conn, ChunkedSentence.characterhash, glosstable, tableprefix);
}
charword = beforethan.replaceFirst("n\\[", "").trim(); //make sure not lose 'equal to' before "greater than'
}else{
token = token.replaceAll("�", " degrees"); //� and % don't work well with \b in reg exp.
token = token.replaceAll("%", " percent");
if(token.matches(".*?as-.*?-as.*")){ //as-long-as case
Pattern p = Pattern.compile("(\\{?as-(?:"+ChunkedSentence.asasthan+")-as\\}?)");
Matcher m = p.matcher(token);
m.find();
keyword = m.group(1).replaceAll("[{}]", "").replaceAll("-", " ");
beforethan = token.substring(0, m.start()).trim().replaceFirst("n\\[", ""); //not including 'than'
afterthan = token.substring(m.start()+m.group(1).length()+1).trim();
charword = keyword.replaceAll("(^as | as$)", "").trim();
keyword = ""; //reset to "" as it is not needed in the final chunk
beforechar = beforethan;
}else{
Pattern p = Pattern.compile("\\b(than)\\b");
Matcher m = p.matcher(token);
m.find();
keyword = m.group(1);
beforethan = token.substring(0, m.start()).trim(); //not including 'than'
afterthan = token.substring(m.start()+m.group(1).length()+1).trim();
charword = beforethan.lastIndexOf(' ')>0 ? beforethan.substring(beforethan.lastIndexOf(' ')+1) : beforethan.replaceFirst("n\\[", "");
beforechar = beforethan.replace(charword, "").trim().replaceFirst("n\\[", "");
}
if(!charword.matches("("+ChunkedSentence.more+")")){
chara = Utilities.lookupCharacter(charword, this.conn, ChunkedSentence.characterhash, glosstable, tableprefix);
}
//afterthan = token.substring(token.indexOf(" than ")+6);
}
if(afterthan.indexOf(" than ")>0){//2nd than in the token
//'more than'... 2 times {longer} than {wide}]
String cp = afterthan;
afterthan = afterthan.replaceFirst(" than ", " constraint[than ")+"]";
token = token.replace(cp, afterthan);
}
//Case B: compared to numerical values
if(afterthan.matches(".*?\\d.*?\\b("+ChunkedSentence.units+"|"+ChunkedSentence.percentage+"|"+ChunkedSentence.size+")\\b.*") || afterthan.matches(".*?(\\d\\.\\d|%).*")){// "n[{longer} than 3 (cm)]" => n[size[{longer} than 3 (cm)]]
//'%\b' won't match '%'
if(chara==null){chara="size";}
//n[more than 4 times {maximum} {width}]=> put {width} part in constraint
//don't add another constraint in n[2 times {longer} constraint[than {wide}]]
if(afterthan.indexOf(" constraint[")<0 && afterthan.matches(".*?\\d.*?\\b("+ChunkedSentence.size+")\\b.*")){
String sizechara = afterthan.replaceFirst(".*?\\d.*?\\b("+ChunkedSentence.times+"|"+ChunkedSentence.percentage+") (?=[^\\d]+\\b("+ChunkedSentence.size+")\\b)", "");
String escaped = sizechara.replaceAll("\\{", "\\\\{").replaceAll("\\}", "\\\\}").replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]");
token = token.replaceFirst(escaped+"$", "constraint["+sizechara+"]");
}
token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
this.chunkedtokens.set(id, token);
return "ChunkTHAN"; //character
}else if(afterthan.matches(".*?.*?\\d.*?\\b("+ChunkedSentence.degree+")\\b.*") || afterthan.matches(".*?\\d\\.\\d.*")){// "n[{longer} than 3 (cm)]" => n[size[{longer} than 3 (cm)]]
if(chara==null){chara="orientation";}
token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
this.chunkedtokens.set(id, token);
return "ChunkTHAN"; //character
}
else if(afterthan.matches(".*?.*?\\b\\d\\b.*")){// "teeth more than 20"
if(chara==null){chara="count";}
token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
this.chunkedtokens.set(id, token);
return "ChunkTHAN";
}//Case C: compared to organs
else if(afterthan.indexOf("(")>=0){ //contains organ
if(chara==null){//is a constraint, lobed n[more than...]
token = "n["+token.replaceFirst("n\\[", "constraint[")+"]";
this.chunkedtokens.set(id, token);
return "ChunkTHAN";
}else{//n[more deeply lobed than...
token = "n["+(beforechar.length()>0? "m["+beforechar+"] ": "")+chara+"["+charword+"] constraint["+keyword+" "+afterthan+"]";
this.chunkedtokens.set(id, token);
return "ChunkTHAN";
}
}//Case A n[wider than long]: compare among characters
else{
token = "n["+(beforechar.length()>0? "m["+beforechar+"] ": "")+chara+"["+charword+"] constraint["+keyword+" "+afterthan+"]";
//token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
this.chunkedtokens.set(id, token);
//return "ChunkTHANC"; //character
return "ChunkTHAN";
}
}
if(token.startsWith("w[")){//w[{proximal} to the (florets)] ; or w[to (midvine)]
//reformat it to CHPP
if(token.indexOf("w[to ")>=0){
token = token.replaceFirst("w\\[to ", "r[p[to] o[")+"]";
this.chunkedtokens.set(id, token);
return "ChunkPrep";
}else{
token = token.replaceFirst("w\\[","t[c[").replaceFirst("(\\s+|\\b)to\\s+", "] r[p[to] o[")+"]]";
this.chunkedtokens.set(id, token);
return "ChunkCHPP";
}
}
if(token.startsWith("l[")){
return "ChunkNPList";
}
if(token.startsWith("i[")){
return "ChunkPPList";
}
if(token.startsWith("z[")){
return "ChunkOrgan";
}
if(token.startsWith("u[")){
return "ChunkNonSubjectOrgan";
}
return null;
}
/**
* Reformats a character-state string into modifier/character notation.
* @param charstring e.g. "{loosely} {arachnoid}"
* @return e.g. "m[loosely] architecture[arachnoid]"
*/
@SuppressWarnings("unused")
private String reformCharacterState(String charstring) {
    // Split the input into an optional leading modifier part and the final state word.
    int split = charstring.lastIndexOf(' ');
    String state;
    String prefix;
    if (split >= 0) {
        state = charstring.substring(split).trim();
        prefix = "m[" + charstring.replace(state, "").trim() + "] ";
    } else {
        state = charstring.trim();
        prefix = "";
    }
    // Label the state word with its character category, or as a verb, if recognized.
    String category = Utilities.lookupCharacter(state, conn, characterhash, glosstable, tableprefix);
    String labeled;
    if (category != null) {
        labeled = category + "[" + state + "]";
    } else if (Utilities.isVerb(state, verbs, notverbs)) {
        labeled = "v[" + state + "]";
    } else {
        labeled = "";
    }
    return (prefix + labeled).trim();
}
/**
* when parsing fails at certain point, forward the pointer to the next comma
*/
public void setPointer2NextComma() {
    // Advance the pointer until it rests on the next punctuation token
    // (or runs off the end of the token list).
    while (this.pointer < this.chunkedtokens.size()
            && !this.chunkedtokens.get(this.pointer).matches("(,|\\.|;|:)")) {
        this.pointer++;
    }
}
public String getText(){
    // Fetch the original sentence text for this sentence's source id.
    // Has to use originalsent, because it is "ditto"-fixed (in SentenceOrganStateMarker.java)
    // and preserves capitalization for measurements markup.
    // Fixed: the original leaked the Statement/ResultSet (never closed) and built the
    // WHERE clause by string concatenation; the source id is now bound as a parameter.
    // (The table name cannot be parameterized, so tableprefix is still concatenated.)
    String sql = "select originalsent from " + this.tableprefix + "_sentence where source = ?";
    try (java.sql.PreparedStatement stmt = conn.prepareStatement(sql)) {
        stmt.setString(1, sentsrc);
        try (ResultSet rs = stmt.executeQuery()) {
            if (rs.next()) {
                this.text = rs.getString(1);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return this.text;
}
/** @return the subject text recorded for this sentence, if any */
public String getSubjectText(){
return this.subjecttext;
}
/*
private void findSubject(){
String senttag = null;
String sentmod = null;
String text = null;
String taggedtext = null;
//boolean islifestyle = false;//make this a post-process
try{
Statement stmt = conn.createStatement();
//ResultSet rs = stmt.executeQuery("select modifier, tag, originalsent from "+this.tableprefix+"_sentence where source ='"+sentsrc+"'");
ResultSet rs = stmt.executeQuery("select modifier, tag, originalsent from "+this.tableprefix+"_sentence where source ='"+sentsrc+"'");
if(rs.next()){
senttag = rs.getString(2).trim();
senttag = senttag.compareTo("general")==0? "ApplicationUtilities.getProperty("unknown.structure.name")" : senttag;
sentmod = rs.getString(1).trim();
this.text = rs.getString(3); //has to use originalsent, because it is "ditto"-fixed (in SentenceOrganStateMarker.java) and perserve capitalization for measurements markup
}
rs = stmt.executeQuery("select rmarkedsent from "+this.tableprefix+"_markedsentence where source ='"+sentsrc+"'");
if(rs.next()){
taggedtext = rs.getString(1).trim();
text = taggedtext.replaceAll("[{}<>]", "").trim();
}
}
catch(Exception e){
e.printStackTrace();
}
if(senttag.compareTo("ignore")!=0){
//sentence subject
if(senttag.compareTo("ApplicationUtilities.getProperty("unknown.structure.name")")==0){
this.subjecttext = "(ApplicationUtilities.getProperty("unknown.structure.name"))";
}else if(senttag.compareTo("chromosome")==0){
this.subjecttext = "(chromosome)";
skipLead("chromosome".split("\\s"));
}else if(senttag.compareTo("ditto")!=0 && senttag.length()>0){
//find the subject segment
String subject = "";
String [] tokens = text.split("\\s+");
if(senttag.indexOf("[")<0){
if(senttag.matches(".*\\b(or|and|plus)\\b.*")){// a , c, and/or b
int or = senttag.lastIndexOf(" or ");
int and = senttag.lastIndexOf(" and ");
int ind = or < and ? and : or;
int plus = senttag.lastIndexOf(" plus ");
ind = plus < ind ? ind : plus;
String seg = senttag.substring(ind).replaceAll("oo", "(oo|ee)").trim();// and/or b
if(seg.indexOf("(oo|ee)")>=0){
seg =seg.replaceFirst(".$", "\\\\w+\\\\b");
}else if(seg.length() < 5){
seg =seg.replaceFirst("..$", "\\\\w+\\\\b");
}else{
seg = seg.replaceFirst("...$", "\\\\w+\\\\b");
}
//seg = seg.replaceFirst("(and|or) ", "(and|or|plus|,) .*?");
seg = seg.replaceFirst("(and|or) ", "(\\\\band\\\\b|\\\\bor\\\\b|\\\\bplus\\\\b|,).*?\\\\b");
//tag derived from complex text expression: "biennial or short_lived perennial" from "iennials or short-lived , usually monocarpic perennials ,"
seg = seg.replaceAll("(?<=\\W)\\s+(?=\\W)", ".*?")
.replaceAll("(?<=\\W)\\s+(?=\\w)", ".*?\\\\b")
.replaceAll("(?<=\\w)\\s+(?=\\W)", "\\\\b.*?")
.replaceAll("(?<=\\w)\\s+(?=\\w)", "\\\\b.*?\\\\b");
Pattern p = Pattern.compile("(^.*?"+seg+")");
Matcher m = p.matcher(text.replaceAll("\\s*-\\s*", "_"));
if(m.find()){
subject = m.group(1);
subject = subject.replaceAll("\\s+-\\s+", "-");
if(skipLead(subject.split("\\s+"))<0){
this.subjecttext = null;
}else{
String organs = senttag.replaceAll("\\w+\\s+(?!(and |or |plus |$))", "|").replaceAll("\\s*\\|\\s*", "|").replaceAll("(^\\||\\|$)", "").replaceAll("\\|+", "|");//o1|o2
//turn organ names in subject to singular
String[] stokens = subject.split("\\s+");
subject = "";
for(int i = 0; i < stokens.length; i++){
String singular = TermOutputerUtilities.toSingular(stokens[i]);
if(singular.matches("("+organs+")")){
stokens[i] = singular;
}
subject += stokens[i]+" ";
}
subject = formatSubject(subject, taggedtext);
//subject = subject.trim().replaceAll("(?<=\\b("+organs+")\\b) ", ") ").replaceAll(" (?=\\b("+organs+")\\b)", " (").replaceFirst("(?<=\\b("+organs+")\\b)$", ")").replaceFirst("^(?=\\b("+organs+")\\b)", "(").trim();
//subject = subject.replaceAll("(?<=\\w) ", "} ").replaceAll(" (?=\\w)", " {").replaceAll("(?<=\\w)$", "}").replaceAll("^(?=\\w)", "{").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
this.subjecttext = subject;
}
}
}else{
for(int i = 0; i<tokens.length; i++){
if(TermOutputerUtilities.toSingular(tokens[i]).compareTo(senttag.replaceAll("_", ""))==0){
subject = subject.replaceAll("\\s+-\\s+", "-");
subject += tokens[i]+ " ";
//subject = "{"+subject.trim().replaceAll("[\\[\\]{}()]", "").replaceAll(" ", "} {")+"}";
//subject = (subject + " ("+tokens[i].replaceAll("[\\[\\]]", "").replaceAll(" ", ") (")+")").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
//this.subjecttext = addSentmod(subject, sentmod); not used in phenoscape annotation
this.subjecttext=formatSubject(subject.trim(), taggedtext);
if(subject.length()>0){
//skipLead(subject.replaceAll("[\\[\\]{}()]", "").split("\\s+"));
int skip = skipLead(subject.split("\\s+"));
if(skip==-1) this.subjecttext=null; //subject search failed.
break;
}
}else{
subject += tokens[i]+" ";
}
}
}
}else if(senttag.indexOf("[")>=0){// must not be of-case
subject = ("{"+sentmod.replaceAll("[\\[\\]]", "").replaceAll(" ", "} {")+"} ("+senttag.replaceAll("[\\[\\]]", "").replaceAll(" ", ") (")+")").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
this.subjecttext=subject;
String mt = (sentmod+" "+senttag).replaceAll("\\[+.+?\\]+", "").replaceAll("\\s+", " ").trim();
if(mt.length()>0)
skipLead(mt.split("\\s+"));
}
}else if(senttag.compareTo("ditto")==0){
if(sentsrc.endsWith("0")){
this.subjecttext ="(ApplicationUtilities.getProperty("unknown.structure.name"))";//it is a starting sentence in a treatment, without an explicit subject.
}else{
this.subjecttext ="ditto";
//mohan code :10/28/2011. If the subject is ditto and the first chunk is a preposition chunk make the subject empty so that it can search within the same sentence for the subject.
int j=0;
String text1 = "";
for(j=0;j<this.chunkedtokens.size();j++)
{
text1 = "";
text1 += this.chunkedtokens.get(j);//gets the first token to check if its a preposition
if(text1.compareTo("")!=0)
{
break;
}
}if(text1.matches("r\\[p\\[.*\\]")){
int i=0;
for(i=0;i<this.chunkedtokens.size();i++)
{
String text2="";
text2+=this.chunkedtokens.get(i);
if(text2.matches("(\\<.*\\>)"))
{
this.subjecttext =null;
break;
}
}
}
//End of mohan//
}
}
}else{
if(this.text.matches(".*?[A-Z]{2,}.*")){ //this.text must be originalsent where captalization is perserved.
this.subjecttext = "measurements";
}else{
this.subjecttext = "ignore";
}
}
if(this.subjecttext!=null && this.subjecttext.endsWith("}")){
this.subjecttext = null;
this.pointer = 0;
}
}*/
/**
* manual digit
* => (manual) (digit) or {manual} (digit) based on the tags used in taggedtext
* @param subject
* @param taggedtext
* @return
*/
@SuppressWarnings("unused")
private String formatSubject(String subject, String taggedtext) {
    // Wraps each token of the subject with the bracket style it carries in taggedtext:
    // organ-tagged tokens become (t), state-tagged tokens become {t}, others stay bare.
    String[] tokens = subject.split("\\s+");
    String formatted = "";
    for (String t : tokens) {
        String tag = getTag(t, taggedtext);
        if (tag.contains("<")) {
            formatted += "(" + t + ") ";
        } else if (tag.contains("{")) {
            formatted += "{" + t + "} ";
        } else {
            formatted += t + " ";
        }
    }
    formatted = formatted.trim();
    // Make sure the last word is in (), in case the word was not tagged with <> in taggedtext.
    if (!formatted.endsWith(")")) {
        int lasti = formatted.lastIndexOf(" ") < 0 ? 0 : formatted.lastIndexOf(" ");
        String lastw = formatted.substring(lasti).replaceAll("\\W", "").trim();
        // Fixed: the original used formatted.replaceAll(lastw, ...), which treated lastw as a
        // regex (throwing PatternSyntaxException on metacharacters) and also rewrote every
        // earlier occurrence of the same word. Only the final token is rewritten now, literally.
        if (!lastw.isEmpty()) {
            formatted = formatted.substring(0, lasti)
                    + formatted.substring(lasti).replace(lastw, "(" + lastw + ")");
        }
    }
    return formatted.replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
}
/**
* Looks up how a token was tagged in the marked sentence.
* @param t a token, e.g. "digit"
* @param taggedtext the tagged sentence, e.g. "&lt;manual&gt; &lt;digit&gt;"
* @return "&lt;" if the token is organ-tagged, "{" if state-tagged, "" otherwise
*/
private String getTag(String t, String taggedtext) {
    // Organ tags may appear plain (<t>) or nested around a state tag (<{t}>).
    if (taggedtext.contains("<" + t + ">") || taggedtext.contains("<{" + t + "}>")) {
        return "<";
    }
    return taggedtext.contains("{" + t + "}") ? "{" : "";
}
/**
* sent
* @param subject: {basal} (blade)
* @param sentmod basal [leaf]
* @return
*/
/*private String addSentmod(String subject, String sentmod) {
if(sentmod.indexOf("[")>=0){
String[] tokens = subject.split("\\s+");
String substring = "";
for(int i = 0; i<tokens.length; i++){
if(!sentmod.matches(".*?\\b"+tokens[i].replaceAll("[{()}]", "")+"\\b.*")){
substring +=tokens[i]+" ";
}
}
substring = substring.trim();
substring ="{"+sentmod.replaceAll("[\\[\\]]", "").replaceAll(" ", "} {").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ")+"} "+substring;
return substring;
}
return subject;
}*/
/**
* Concatenates the chunked tokens in the given index range.
* @param begainindex start index (inclusive)
* @param endindex end index (exclusive)
* @return the tokens in the range joined by single spaces
*/
public String getText(int begainindex, int endindex) {
    // Join the tokens in [begainindex, endindex) with spaces.
    // Uses a StringBuilder instead of repeated String concatenation in the loop.
    StringBuilder text = new StringBuilder();
    for (int i = begainindex; i < endindex; i++) {
        text.append(this.chunkedtokens.get(i)).append(' ');
    }
    return text.toString().replaceAll("\\s+", " ").trim();
}
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
// No standalone entry point: this class is driven by the character markup pipeline.
}
public String getTokenAt(int i) {
    // Returns the chunked token at index i, or null for any out-of-range index.
    // Fixed: the original used a catch-all try/catch for bounds handling (exceptions
    // as control flow); an explicit check preserves the same observable behavior,
    // including returning null when the token list itself is unset.
    if (this.chunkedtokens == null || i < 0 || i >= this.chunkedtokens.size()) {
        return null;
    }
    return this.chunkedtokens.get(i);
}
/**
* Records a clause-level modifier constraint that applies to all characters in this sentence.
* @param modifier the modifier text
* @param constraintId the id of the constrained element (may be null)
*/
public void setClauseModifierConstraint(String modifier, String constraintId) {
this.clauseModifierConstraint = modifier;
this.clauseModifierContraintId = constraintId;
}
public ArrayList<String> getClauseModifierConstraint() {//apply to all characters in this chunkedsentence
    // No constraint recorded: nothing to return.
    if (this.clauseModifierConstraint == null) {
        return null;
    }
    // First element is the modifier text; the constrained element id follows when present.
    ArrayList<String> result = new ArrayList<String>();
    result.add(this.clauseModifierConstraint);
    if (this.clauseModifierContraintId != null) {
        result.add(this.clauseModifierContraintId);
    }
    return result;
}
}