ChunkedSentence.java example

Explorer
phenoscape-nlp-master
- parsing-gui
  - lib
    - elk-distribution-0.3.2-owlapi-library
      - examples
        org
        semanticweb
        elk
        owlapi
        examples
        QueryingUnnamedClassExpressions.java
        QueryingWithNamedClasses.java
        RetrievingInstances.java
        SavingInferredAxioms.java
  - src
    - com
      - swtdesigner
        SWTResourceManager.java
    - fna
- phenoscapeII
  - src
 /* $Id: ChunkedSentence.java 988 2011-09-23 16:44:53Z hong1.cui $ */
/**
 * 
 */
package fna.charactermarkup;

import java.lang.reflect.Constructor;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jdom.Document;
import org.jdom.Element;
import org.jdom.xpath.XPath;

import outputter.knowledge.TermOutputerUtilities;
import conceptmapping.*;




/**
 * 
 * @author hongcui
 *	This class generates a chunked sentence from the parsing tree and provides a set of access methods to facilitate final annotation.
 *	A chunked sentence is a marked sentence (with organs enclosed by <> and states by {}) with "chunks" of text enclosed by [], for example 
 *	<Heads> 3 , {erect} , [in corymbiform or paniculiform arrays]. (sent. 302)
 *
 *	the annotation of a chunk may require access to the original parsing tree, but that is not handled by this class.
 */

@SuppressWarnings("unchecked")
public class ChunkedSentence {
	private String glosstable = null;
	private String markedsent = null;
	private String chunkedsent = null;
	private ArrayList<String> chunkedtokens = null;
	@SuppressWarnings("unused")
	private ArrayList<String> charactertokensReversed = new ArrayList<String>();
	private int pointer = 0; //pointing at the next chunk to be annotated
	private String subjecttext = null;
	private String text = null;
	private String sentsrc = null;
	private String tableprefix = null;
	private Element root;
	public static final String binaryTvalues = "true|yes|usually";
	public static final String binaryFvalues = "false|no|rarely";
	public static final String pronouns = "them";
	public static final String locationpp="near|from";
	public static final String units= "cm|mm|dm|m|meter|meters|microns|micron|unes|�m|um";
	public static final String percentage="%|percent";
	public static final String degree="�|degree|degrees";
	public static final String times = "times|folds|lengths|widths";
	public static final String per = "per";
	public static final String more="greater|more|less|fewer";
	public static final String counts="few|several|many|none|numerous|single|couple";
	public static final String basecounts="each|every|per";
	public static final String pairs="pair|pairs|series|array|arrays|row|rows";
	public static final String clusters="cluster|clusters|involucre|involucres|rosette|rosettes|pair|pairs|series|ornament|ornamentation|array|arrays";
	public static final String prepositions = "above|across|after|along|among|amongst|around|as|at|before|behind|beneath|between|beyond|by|for|from|in|into|near|of|off|on|onto|out|outside|over|than|throughout|to|toward|towards|up|upward|with|without";
	public static final String stop = "a|about|above|across|after|along|also|although|amp|an|and|are|as|at|be|because|become|becomes|becoming|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|for|from|had|has|have|hence|here|how|if|in|into|inside|inward|is|it|its|may|might|more|most|near|no|not|of|off|on|onto|or|out|outside|outward|over|should|so|than|that|the|then|there|these|this|those|throughout|to|toward|towards|up|upward|was|were|what|when|where|which|why|with|within|without|would";
	public static final String skip = "and|becoming|if|or|that|these|this|those|to|what|when|where|which|why|not|throughout";
	public static final String positionprep = "of|part_of|in|on|between";
	public static final String asasthan = "long|wide|broad|tall|high|deep|short|narrow|thick"; //as-long-as wide
	public static final String size="long|longer|wide|wider|broad|broader|tall|taller|high|higher|deep|deeper|short|shorter|narrow|narrower|thick|thicker|length|width|height|depth|breadth";
	public static Hashtable<String, String> eqcharacters = new Hashtable<String, String>();
	private boolean inSegment = false;
	private boolean rightAfterSubject = false;
	private int sentid = -1;
	private ArrayList<String> pastpointers = new ArrayList<String>();

	public String unassignedmodifier = null;
	//caches
	public static Hashtable<String, String> characterhash = new Hashtable<String, String>();
	public static ArrayList<String> adverbs = new ArrayList<String>();
	public static ArrayList<String> verbs = new ArrayList<String>();
	public static ArrayList<String> nouns = new ArrayList<String>();
	public static ArrayList<String> notadverbs = new ArrayList<String>();
	public static ArrayList<String> notverbs = new ArrayList<String>();
	public static ArrayList<String> notnouns = new ArrayList<String>();
	
	protected Connection conn = null;
	/*static protected String username = "root";
	static protected String password = "root";
	static protected String database = "fnav19_benchmark";*/
	
	private boolean printNorm = false;
	private boolean printNormThan = false;
	private boolean printNormTo = false;
	private boolean printExp = false;
	private boolean printRecover = false;
	private String clauseModifierConstraint;
	private String clauseModifierContraintId;
	private String type;
	private String characters;
	

	
	public ChunkedSentence(ArrayList<String> chunkedtokens, String chunkedsent, Connection conn, String glosstable, String tableprefix){
		this.chunkedtokens = chunkedtokens;
		this.chunkedsent = chunkedsent;
		this.conn = conn;
		this.glosstable = glosstable;
		this.tableprefix = tableprefix;
		this.recoverOrgans();
		
	}
	/**
	 * @param tobechunkedmarkedsent 
	 * @param tree 
	 * 
	 */
	public ChunkedSentence(int id, Document collapsedtree, Document tree, String tobechunkedmarkedsent,  String sentsrc, String type, String tableprefix,Connection conn, String glosstable, String characters) throws Exception {
		eqcharacters.put("wide", "width"); //2 cm. wide
		eqcharacters.put("long", "length");
		eqcharacters.put("broad", "width");
		eqcharacters.put("diam", "diameter");
		eqcharacters.put("size", "size");
		eqcharacters.put("high", "height");
		eqcharacters.put("height", "height");
		eqcharacters.put("width", "width");
		eqcharacters.put("length", "length");
		eqcharacters.put("depth", "depth");
		eqcharacters.put("breadth", "width");
				
		this.tableprefix = tableprefix;
		this.glosstable = glosstable;
		this.characters = characters;
		this.conn = conn;
		this.type = type;
		
		/*try{
			Statement stmt = conn.createStatement();
			ResultSet rs = stmt.executeQuery("select term from "+glosstable+" where category='character'");
			while(rs.next()){
				nouns.add(rs.getString("term"));
			}
		}catch(Exception e){
			e.printStackTrace();
		}*/
		
		nouns.addAll(Arrays.asList(characters.split("\\|")));
		
		this.sentsrc = sentsrc;
		this.sentid = id;
		this.markedsent = tobechunkedmarkedsent;
		//tobechunkedmarkedsent = tobechunkedmarkedsent.replaceAll("[\\[\\(]", " -LRB-/-LRB- ").replaceAll("[\\]\\)]", " -RRB-/-RRB- ").replaceAll("\\s+", " ").trim(); 
		tobechunkedmarkedsent = tobechunkedmarkedsent.replaceAll("[\\[\\(]", "-LRB-/-LRB-").replaceAll("[\\]\\)]", "-RRB-/-RRB-").trim(); 
		if(tobechunkedmarkedsent.matches(".*?\\d.*")){
			tobechunkedmarkedsent = NumericalHandler.normalizeNumberExp(tobechunkedmarkedsent);
		}
		String[] temp = tobechunkedmarkedsent.split("\\s+");
		chunkedtokens = new ArrayList<String>(Arrays.asList(temp)); //based on markedsent, which provides <>{} tags.
				
		root = collapsedtree.getRootElement();
		String treetext = SentenceChunker4StanfordParser.allText(root).trim();
		String[] treetoken = treetext.split("\\s+"); //based on the parsing tree, which holds some chunks.
		String realchunk = "";
		ArrayList<String> brackets = new ArrayList<String>();
		int i = 0;
		//go through treetoken to chunk state lists, and brackets
		for(; i<treetoken.length; i++){
			if(treetoken[i].matches("^\\S+~list~\\S+")){//r[p[of] o[{architecture~list~smooth~or~barbellulate~to~plumose} (bristles)]]
				//String[] parts = treetoken[i].split("~list~");
				//treetoken[i] = parts[0]+"["+parts[1]+"]"; 
				//treetoken[i] = treetoken[i].replace("~list~", "[{").replaceAll("\\{(?=\\w{2,}\\[)", "").replaceAll("(?<=~[a-z0-9-]{2,40})(\\}| |$)","}]");
				treetoken[i] = treetoken[i].replace("~list~", "[{").replaceAll("\\{(?=\\w{2,}\\[)", "").replaceAll("(?<=~[a-z0-9-]{1,40})(\\}| |$)","}]");
			}
		}		
		for(i= 0; i<treetoken.length; i++){
			if(treetoken[i].indexOf('[') >=0){
				int bcount  = treetoken[i].replaceAll("[^\\[]", "").trim().length();
				for(int j = 0; j < bcount; j++){
					brackets.add("[");
				}
			}

			if(brackets.size()>0){//in
				//restore original number expressions
				String w = treetoken[i].replaceAll("(\\w+\\[|\\])", "");
				realchunk += treetoken[i].replace(w, chunkedtokens.get(i))+" ";
				chunkedtokens.set(i, "");
			}
			
			if(treetoken[i].indexOf(']')>=0){
				int bcount  = treetoken[i].replaceAll("[^\\]]", "").trim().length();
				for(int j = 0; j < bcount; j++){
					brackets.remove(0);
				}
			}
			
			if(brackets.size()==0 && realchunk.length()>0){
				chunkedtokens.set(i, realchunk.replaceAll("<", "(").replaceAll(">", ")").trim()); //inside a chunk, an organ is marked by #. e.g. #leaves# 
				realchunk="";
			}
			
		}
		if(realchunk.length()>0){
			chunkedtokens.set(i-1+0, realchunk.trim());
		}		
		this.chunkedsent = "";
		
		
		int discoveredchunks = 0;

		discoveredchunks += normalizeThan();//do Than first before OtherINs
		/*OtherINs first: r[p[equal-to] o[or {greater} than {depth}]] r[p[of] o[{adjacent} (prearticular)]] .
		 *Than first: {equal-to} or n[{greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]] . This is desired results for ChunkTHAN
		 *Besides, it is important to group all Than cases as ChunkTHAN, not split them between ChunkPrep and ChunkTHAN
		 */
		discoveredchunks += normalizeOtherINs(); //find objects for those VB/IN that without
		discoveredchunks += normalizeBetween();
		//discoveredchunks += normalizeThan();
		discoveredchunks += normalizeTo();
		normalizeUnits();
		normalizePPList4Than(); //take care of orphaned 'equal-to or' 'as-short-as or'
		normalizeAsAsThan();
		int allchunks = chunks();
		StanfordParser.countChunks(allchunks, discoveredchunks);
		
		recoverSubjectOrgan4Character();
		recoverVPChunks();//recover unidentified verb phrases
		recoverConjunctedOrgans(); //
		//findSubject(); no longer needed //set the pointer to a place right after the subject, assuming the subject part is stable in chunkedtokens at this time
		recoverOrgans();
		recoverCharacter4OrganList();
		segmentSent();//insert segment marks in chunkedtokens while producing this.chunkedsent
		
		//TODO move this to an earlier place
		//if the last words in l[] are marked with {}, take them out of the chunk
		//if(this.chunkedsent.matches(".*?l\\[[^\\[].*?}\\].*")){
		//	removeStateFromList();
		//}
	}
	
	/**
	 * 1. as-long-as wide 
	 * 2. as-long-as organ
	 * 3. as-long-as width of organ
	 * form n[] chunk
	 */
	private void normalizeAsAsThan() {
		for(int i = 0; i< this.chunkedtokens.size(); i++){	
			String token = this.chunkedtokens.get(i);
			String chunk = token+" ";
			boolean success = false;
			if(token.matches("\\{?as-("+ChunkedSentence.asasthan+")-as\\}?")){//{as-long-as}: treat these as ChunkTHAN
				//looking for the 2nd part
				int j = 0; String t = "";
				for(j = i+1; j<this.chunkedtokens.size(); j++){
					t = this.chunkedtokens.get(j);
					if(t.length()!=0) break;
				}
				if(t.matches("\\{?("+ChunkedSentence.asasthan+")\\}?")){ //case 1
					chunk +=t+" ";
					success = true;
				}
				else if(t.matches("\\{?(height|width|length|depth|thickness)\\}?")){ //case 3
					chunk +=t+" ";
					for(int k = j+1; k < this.chunkedtokens.size(); k++){
						if(this.chunkedtokens.get(k).length()==0) continue;
						if(this.chunkedtokens.get(k).startsWith("r[p[of")){
							chunk += this.chunkedtokens.get(k)+" ";
							j = k;
							success = true;
							break;
						}
					}
				}
				if(!success){
					//case 2
					while(!t.startsWith("(") && !t.equals(",")){//found bony in {bony} (portion)
						chunk +=t+" ";
						if(j < this.chunkedtokens.size()-1) t = this.chunkedtokens.get(++j);
						else break;
						success = true;
					}
					while((t.length()==0 || t.startsWith("("))){ //found (portion)
						chunk +=t+" ";
						if(j < this.chunkedtokens.size()-1) t = this.chunkedtokens.get(++j);
						else break;
					}
				}

				//form n[chunk]
				if(success){
					this.chunkedtokens.set(i, "n["+chunk.trim()+"]");
					for(int k=i+1; k<=j; k++){
						this.chunkedtokens.set(k, "");
					}
				}
			}			
		}		
	}
	/**
	 * turn {equal-to} or n[{greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]] 
	 * to  n[{equal-to} or {greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]] 
	 * 
	 * note cases like n[less than or {equal-to} 35 {percent}] are already in the desired form. 
	 */
	private void normalizePPList4Than() {
		//search for n[] in 
		for(int i = 0; i< this.chunkedtokens.size(); i++){		
			if(this.chunkedtokens.get(i).startsWith("n[")){
				//search back to include proceeding prepositions
				String preps = "";
				int j;
				for(j = i-1; j >= 0; j--){
					String token = this.chunkedtokens.get(j);
					if(token.length()==0) continue;
					if(token.startsWith("r[") && token.indexOf("o[")<0){
						preps = token+" "+preps;
					}else if(token.replaceAll("[{}]", "").matches(ChunkedSentence.prepositions+"|"+POSTagger4StanfordParser.comprepstring+"|as-("+ChunkedSentence.asasthan+")-as")){//equal-to, as-long-as
						preps = token+" "+preps;
					}else if(token.matches("or|,")){
						preps = token+" "+preps;
					}else{//encounter first non-prep part, end search
						break;
					}
				}
				preps = preps.trim();
				while(preps.startsWith("or") || preps.startsWith(",")){
					//remove the leading (or|,)
					preps = preps.replaceFirst("^(or|,)($| )", ""); //preps could be just "or" --in some wired sentences
					j++;
				}
				if(preps.length()>0){
					for(int k = j+1; k<i; k++){
						this.chunkedtokens.set(k, "");
					}
					this.chunkedtokens.set(i, "n["+preps+" "+this.chunkedtokens.get(i).replaceFirst("n\\[", ""));
				}
			}
		}		
	}
	/**
	 * contact between organ a and organ b
	 * @throws Exception 
	 * 
	 * 
	 */	
	private void recoverSubjectOrgan4Character() throws Exception {
		//if type is character and the first non-empty chunk is not a noun
		if(type.equals("character")){
			if(this.chunkedtokens.size()<=2 && this.chunkedtokens.get(0).matches("\\w+")){
				if(this.chunkedtokens.size()>1 && !this.chunkedtokens.get(1).matches("\\w+")){
					this.chunkedtokens.set(0, "<"+this.chunkedtokens.get(0)+">");
					return;
				}
			}
			
			String token = "";
			int i = 0;
			while(token.length()==0 && i < this.chunkedtokens.size()){
				token = this.chunkedtokens.get(i);
				i++;
			}
			for(int j = i-1; j < this.chunkedtokens.size(); j++){ //process the leading bare tokens
				token = this.chunkedtokens.get(j);
				if(token.length()==0 || token.indexOf("[")>0 || token.indexOf("<")>=0 || token.indexOf("{")>=0) break;
				if(XPath.selectNodes(root, "//NN[@text='"+token+"']").size() > 0 &&
						!token.matches("("+this.characters+")")){//bare token
					token = "<"+token+">";
					this.chunkedtokens.set(j, token);					
				}
			}
		}		
	}
	/**
	 * count the chunks in chunkedtokens
	 * @return
	 */
	private int chunks() {
		int count = 0;
		Iterator<String> it = this.chunkedtokens.iterator();
		while(it.hasNext()){
			if(it.next().matches("[^l]\\[.*")){
				count++;
			}
		}
		return count;
	}
	/**
	 * scan through a chunkedtokens to find Verbs not parsed as such by the parser
	 * find verbs by
	 * 1. look into this.verbs
	 * 2. find pattern o ting/ted by o, then t must be a verb and save this verb in verbs
	 */
	private void recoverVPChunks() {
		for(int i = 0; i < this.chunkedtokens.size(); i++){
			String t = this.chunkedtokens.get(i);
			if(t.contains("-")) continue; //check 751
			if(!t.contains("[") && ChunkedSentence.verbs.contains(t)){
				recoverVPChunk(i);
			}else if(!t.contains("[") && (t.endsWith("ing")|| t.endsWith("ing}"))){
				 if(connects2organs(i)){
					 ChunkedSentence.verbs.add(t.replaceAll("\\W", ""));
					 recoverVPChunk(i);
				 }
			}/*else if(!t.contains("[")&& t.endsWith("ed") && this.chunkedtokens.size()>i+1 && this.chunkedtokens.get(i+1).matches(".*?\\bby\\b.*")){				
			}*/
		}
	}

	/**
	 * 
	 * @param i :index of the verb
	 * @return
	 */
	private boolean connects2organs(int i) {
		boolean organ1 = false;
		boolean organ2 = false;
		if(i>=1 && this.chunkedtokens.size()>i+1){
			String t = this.chunkedtokens.get(i-1);
			if(t.endsWith(">") || t.matches(".*\\bo\\[[^\\]\\[]*\\]+") || t.endsWith(")") ){
				organ1 = true;
			}
			
			do{
				i++;
				t = this.chunkedtokens.get(i).trim();
			}while(t.length()==0);
			if(t.endsWith(">") || t.matches("[uz]?\\[?\\bo\\[[^\\]\\[]*\\]+") || t.endsWith(")") ){
				organ2 = true;
			}
			
			/*for(int j = i+1; j < this.chunkedtokens.size(); j++){
				t = this.chunkedtokens.get(j);
				if(t.endsWith(">") || t.matches("[uz]?\\[?\\bo\\[[^\\]\\[]*\\]+") || t.endsWith(")") ){
					organ2 = true;
					break;
				}
				if((j == i+1 && t.equals(","))|| t.matches("\\w+")){
					organ2 = false;
					break;
				}
			}*/
		}
		return organ1 && organ2;
	}
	/**
	 * 
	 * @param i: the index of a possible verb
	 */
	private void recoverVPChunk(int i) {
	 
		String chunk = "";
		boolean foundo = false;
		int j = i+1;
		for(; j < chunkedtokens.size(); j++){
			//scan for the end of the chunk TODO: may refactor with normalizeOtherINs on this search
			String t = this.chunkedtokens.get(j);
			if(j==i+1 && t.matches(",")){ //verb not have object
				return;
			}
			if(t.matches("(;|\\.)")) break;
			if(foundo && (t.contains("{") || t.contains("~list~")||t.matches("(\\w+|,|;|\\.)")||t.contains("["))){
				break;
			}
			if(t.contains("<")){
				chunk += t+" ";
				foundo = true;
			}else if(t.matches(".*?\\bo\\[[^\\]*]+") || t.matches(".*?l\\[[^\\]]*\\]+")){//found noun)
				chunk += t+" ";
				foundo = true;
				j++;
				break;
			}else{
				chunk += t+" ";
			}						
		}
		if(!foundo) return;
		//format the chunk
		chunk = chunk.trim();
		if(chunk.endsWith(">")){
			chunk = "b[v["+this.chunkedtokens.get(i)+"]"+" o["+chunk.replaceAll("<", "(").replaceAll(">", ")")+"]]";
		}else if(chunk.matches(".*?\\bo\\[.*\\]+")){
			if(chunk.contains(" v[")){
				chunk = chunk.replaceFirst(" v[", " v["+this.chunkedtokens.get(i)+" ");
			}else if(chunk.matches("^r\\[.*")){//t[c[{extending}] r[p[to] o[(midvalve)]]]
				//chunk = chunk.replaceFirst("^r[p[", "b[v["+this.chunkedtokens.get(i)+ " "); //need to make the v is taken as a relation in processChunkVP
				chunk = "t[c["+this.chunkedtokens.get(i)+"] "+chunk+"]";
			}else if(chunk.startsWith("l[")){
				chunk = "b[v["+this.chunkedtokens.get(i)+"] "+chunk.replaceFirst("^l\\[", "o[")+"]"; 
			}else if(chunk.startsWith("u[")){
				chunk = chunk.replaceFirst("^u[", "b[v["+this.chunkedtokens.get(i)+ "] "); 
			}
		}
		//this.chunkedtokens.set(i, chunk);
		if(this.printRecover){
			System.out.println("verb chunk formed: "+chunk +" for \n"+this.sentid+"["+this.sentsrc+"]"+this.markedsent);
		}
		for(int k = i; k<j; k++){
			this.chunkedtokens.set(k, "");
		}
		this.chunkedtokens.set(j-1, chunk);
			/*
			t = t.replaceFirst("^u\\[", "").replaceFirst("\\]$", "");
			String o = t.substring(t.indexOf("o[")).trim();
			t = t.substring(0, t.indexOf("o[")).trim();
			if(t.length()>0){
				String[] states = t.split("\\s+");
				for(int k = 0; k < states.length; k++){
					String ch = TermOutputerUtilities.lookupCharacter(states[k], conn, characterhash, glosstable);
					if(ch!=null){
						scs = (scs.trim().length()>0? scs.trim()+"] ": "")+ch+"["+states[k].replaceAll("[{}]", "")+" ";
					}else{
						scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+states[k].replaceAll("[{}]", "")+" ";
					}
				}		
			}
			scs = (scs.trim().length()>0? scs.trim()+"] ": "")+o;
		}*/
		
	}
	/**
	 * attempts to mark modified non-subject organs as a chunk to avoid characters of these organs be attached to previous organs
	 * run this after recoverConjunctedOrgans to exclude organs that are objects of VP/PP-phrases)
	 * does not attempt to recognize conjunctions as the decisions may be context-dependent
	 */
	private void recoverOrgans() {
		//for(int i = this.chunkedtokens.size()-1; i >=this.pointer; i--){
		for(int i = this.chunkedtokens.size()-1; i >=0+0; i--){
			String t = this.chunkedtokens.get(i);
			if(t.endsWith(">") || t.endsWith(")")){//TODO: not dealing with nplist at this time, may be later
				recoverOrgan(i);//chunk and update chunkedtokens
			}
		}		
	}
	
	/**
	 * 
	 * @param last: the index of the last part of an organ name
	 */
	private void recoverOrgan(int last) {
		String chunk = this.chunkedtokens.get(last);
		boolean foundm = false; //modifiers
		boolean subjecto = false;
		int i = last-1;
		//for(;i >=this.pointer; i--){
		for(;i >=0; i--){
			String t = this.chunkedtokens.get(i);
			boolean isspatial = false;
			if(Utilities.isPosition(t.replaceAll("[{<>}]", ""), conn, this.glosstable)){
				t = t.replaceAll("\\{", "<").replaceAll("\\}", ">").replaceAll("<+", "<").replaceAll(">+", ">");
				isspatial =true;
			}
			/*preventing "the" from blocking the organ following ",the" to being matched as a subject organ- mohan 10/19/2011*/
			if(t.matches("the|a|an")){
				if(i!=0){
					i=i-1;
					t = this.chunkedtokens.get(i);
				}
			}			
			/*end mohan*/
			if((t.matches("\\{[\\w-]+\\}") && !isspatial)|| t.matches("(\\d+)") || t.contains("~list~")){
				chunk = t+" "+chunk;
				foundm = true;
			}else if(!foundm && (t.endsWith(">") ||t.endsWith(")") || isspatial )){ //if m o m o, collect two chunks
				chunk = t+" "+chunk;
			}else{
				if(t.equals(","))subjecto = true;
				else if((i==0 && t.matches("(a|an|the)"))){
					subjecto = true;
					this.chunkedtokens.set(0, ""); //remove the article
				}
				break;
			}
		}
		chunk = chunk.trim();
		
		//if(i==0) subjecto = true;
		//reformat this.chunkedtokens
		if(subjecto || i==-1){ 
			chunk = "z["+chunk.trim().replaceAll("<", "(").replaceAll(">", ")")+"]";
		}else{
			chunk = "u["+chunk.trim().replaceFirst("[<(]", "o[(").replaceFirst("[)>]$", ")]").replaceAll("<", "(").replaceAll(">", ")").replaceAll("[{}]", "")+"]";//<leaf><blade> => u[o[(leaf)(blade)]]
		}
		
		//reset from i+2 to last
		for(int j = i+1; j <last; j++){
			this.chunkedtokens.set(j, "");
		}
		while(i>=0 && this.chunkedtokens.get(i).length()==0){
			i--;
		}
		//if the previous nonempty chunk ends with a (), then merge this new u[] with the ()
		if(i>=0 && this.chunkedtokens.get(i).matches(".*\\)\\W*\\]$")){
			chunk = "("+chunk.replaceAll("(\\w+\\[|\\])", "").replaceAll(" ", ") (")+")";
			chunk = chunk.replaceAll("\\(+", "(").replaceAll("\\)+", ")");
			String previous = this.chunkedtokens.get(i);
			String p1 = previous.substring(0, previous.lastIndexOf(")")+1);
			previous = previous.replace(p1, p1+" "+chunk);
		    this.chunkedtokens.set(i, previous);
		    this.chunkedtokens.set(last, "");
		}else{
			//otherwise
			this.chunkedtokens.set(last, chunk);
		}
		if(this.printRecover){
			System.out.println("nsorgan chunk formed: "+chunk +" for \n"+this.sentid+"["+this.sentsrc+"]"+this.markedsent);
		}
	}
	/**
	 * attempts to include broken-away conjuncted organs to pp and vb phrase
	 */
	private void recoverConjunctedOrgans() {
		for(int i = 0; i < this.chunkedtokens.size(); i++){
			String t = this.chunkedtokens.get(i);
			if(this.chunkedtokens.size()>i+2){
				if((t.startsWith("r[p") || t.startsWith("b[v")) && 
						(this.chunkedtokens.get(i+1).matches("(and|or|plus)")||
								(this.chunkedtokens.get(i+1).matches(",") && this.chunkedtokens.get(i+2).matches("(and|or|plus)")))) {//check 211
					recoverConjunctedOrgans4PP(i);
				}else if((t.startsWith("r[p") || t.startsWith("b[v")) && this.chunkedtokens.get(i+1).startsWith("<")){//found a broken away noun
					int j = i;
					String newo = "";
					String o = this.chunkedtokens.get(++j);					
					do{
						newo += o;
						this.chunkedtokens.set(j, "");
						o = this.chunkedtokens.get(++j);					
					}while (o.startsWith("<"));
					String p1 = t.replaceFirst("\\]+$", "");
					String p2 = t.replace(p1, "");
					newo = newo.replaceAll("<", "(").replaceAll(">", ")").trim();
					t = p1+" "+newo+p2;
					this.chunkedtokens.set(i, "");
					this.chunkedtokens.set(--j, t);
				}
				
				
				/*else if (t.startsWith("b[v") && this.chunkedtokens.get(i+1).matches("(and|or|plus)")){
					recoverConjunctedOrgans4VB(i);
				}*/
			}
		}
	}
	
	/**
	 * a {fused} l[(cleithrum) and (suprascapula)] . 
	 * 
	 * should be
	 * 
	 * a l[{fused} (cleithrum) and (suprascapula)] . 
	 */
	private void recoverCharacter4OrganList(){
		for(int i = 0; i < this.chunkedtokens.size(); i++){
			String t = this.chunkedtokens.get(i);
			if(t.startsWith("l[")){
				for(int j=i-1; j>=0; j--){
					if(this.chunkedtokens.get(j).length()==0) continue;
					if(this.chunkedtokens.get(j).endsWith("}")){
						t = "l["+this.chunkedtokens.get(j)+" "+t.replaceFirst("l\\[", "");
						this.chunkedtokens.set(j, "");
						this.chunkedtokens.set(i, t);
					}else{
						j=-1; //get out of j-loop
					}
				}
			}
		}
	}

	/**
	 * recover if what follows the PP is "and|or|plus" and a (modified) organ followed by a , or a series of chunks
	 * @param i: the index where a PP-chunk followed by and|or|plus is found
	 */
	private void recoverConjunctedOrgans4PP(int i) {
		String recovered = this.chunkedtokens.get(i+1)+" ";//and|or|plus
		boolean foundo = false;
		boolean recover = true;
		int endindex = 0;
		for(int j = i+2; j < this.chunkedtokens.size(); j++){
			String t = this.chunkedtokens.get(j);
			if(!foundo && (t.matches("\\{\\w+\\}") || t.equals(",") || t.contains("~list~"))){//states before an organ 
				recovered += t+" ";
			}else if(t.matches("<\\w+>") || t.contains("l[")){//organ
				recovered += t+" ";
				endindex = j;
				foundo = true;
			}else if(foundo && t.matches("(,|;|\\.)")){//states before an organ 
				break; //organ followed by ",",  should recover
			}else if(foundo && t.contains("[") && !t.contains("~list~")){//found or not found organ
				//do nothing
			}else{
				recover = false;
				break;
			}
		}
		
		if(recover){
			//reformat: insert recovered before the last set of ] 
			String chunk = this.chunkedtokens.get(i);
			String p1 = chunk.replaceFirst("\\]+$", "");
			String p2 = chunk.replace(p1, "");
			recovered = recovered.replaceAll("<", "(").replaceAll(">", ")").trim();
			chunk = p1+" "+recovered+p2;
			this.chunkedtokens.set(i, "");
			//reset from i+1 to endindex
			for(int j = i+1; j <endindex; j++){
				this.chunkedtokens.set(j, "");
			}
			this.chunkedtokens.set(endindex, chunk);
			if(this.printRecover){
				System.out.println("pp/vp object chunk formed: "+chunk +" for \n"+this.sentid+"["+this.sentsrc+"]"+this.markedsent);
			}
			
		}
	}
	/**
	 * insert segment marks in chunkedtokens while producing this.chunkedsent
	 * after first round of segmentation, proceed to the 2nd round to disambiguate ", those of" 
	 */
	private void segmentSent() {
		int i;
		for(i = this.chunkedtokens.size()-1; i>=0; i--){
			String t = this.chunkedtokens.get(i);
			if(t.compareTo("") !=0){
				this.chunkedsent = t+" "+this.chunkedsent;;
			}
			if(t.indexOf('<')>=0 || t.indexOf("z[")>=0){//z[ is chunkOrgan
				for(i = i-1; i>=0; i--){
					String m = this.chunkedtokens.get(i);
					if(m.matches(".*?\\b("+ChunkedSentence.prepositions+")\\b.*")){
						this.chunkedsent = m+" "+this.chunkedsent;
						break; //has prepositions before <
					}
					//if(m.matches("(,|;|:)") && !suspend){
					if(m.matches("(,|;|:)")){
						this.chunkedtokens.set(i, "SG"+m+"SG"); //insert a segment mark
						this.chunkedsent = "SG"+m+"SG"+" "+this.chunkedsent;
						break;
					}else{
						if(m.compareTo("") !=0){
							this.chunkedsent = m+" "+this.chunkedsent;
						}
					}
					
				}
			}
		}
		if(this.chunkedtokens.get(this.chunkedtokens.size()-1).matches("\\W")){
			this.chunkedtokens.set(this.chunkedtokens.size()-1, "SG"+this.chunkedtokens.get(this.chunkedtokens.size()-1)+"SG");
		}
		this.chunkedsent.trim();
		disambiguateThose();
	}
	
	/**
	 * <corollas> {purple} , those of {sterile} <florets> � {expanded} , {exceeding} <corollas> of {fertile} <florets> , those of {fertile} <florets> 15-18 {mm} .
	 * <phyllaries> {many} in 6-8 <series>... , <apices> {shape~list~acute~to~acuminate} , those of {innermost} {bristly-ciliate-or-plumose} .
	 * find "those" instances in chunkedsent, fix chunkedsent, then fix chunkedtokens
	 * fix = replacing those with the subject of the last segment
	 */
	private void disambiguateThose() {
		Pattern p = null;
		if(this.chunkedsent.indexOf(" those r[p[of")>0){
			//p = Pattern.compile("((?:.*?SG\\WSG.*|^)<(.*?)>.*?)those(\\s+r?\\[?p?\\[?of.*)");
			p = Pattern.compile("((?:.*?SG\\WSG.*|^)(?:z\\[\\(|<)(.*?)(?:>|\\)\\]).*?)those(\\s+r?\\[?p?\\[?of.*)");
			Matcher m = p.matcher(this.chunkedsent);
			while(m.matches()){
				String noun = m.group(2);
				int indexOfthose = m.group(1).split("\\s+").length;
				//in case there are to~12~cm, need to adjust indexOfthose
				String textbeforethose = m.group(1);
				Pattern pt = Pattern.compile("(.*?)\\b(to~\\d+~(?:"+ChunkedSentence.units+").*?)\\b(.*)");
				Matcher mt = pt.matcher(textbeforethose);
				while(mt.matches()){
					textbeforethose = mt.group(3);
					indexOfthose += mt.group(2).replaceAll("[^~]", "").length();
					mt = pt.matcher(textbeforethose);
				}
				//update chunkedsent and chunkedtokens
				//"those" may be included in a chunk
				String token = this.chunkedtokens.get(indexOfthose);
				if(token.compareTo("those")==0){
					String temp = m.group(1).trim();
					temp = temp.replaceFirst(",$", "SG,SG");
					this.chunkedsent = temp+" <"+noun+">"+m.group(3);
					this.chunkedtokens.set(indexOfthose, "<"+noun+">");
					if(this.chunkedtokens.get(indexOfthose-1).compareTo(",")==0){
						this.chunkedtokens.set(indexOfthose-1, "SG,SG");
					}
				}else{//in a chunk: break the chunk into two
					int indexOfchunk = findChunk(indexOfthose, "those");
					String chunk = this.chunkedtokens.get(indexOfchunk);
					String[] two = chunk.split("\\s*those\\s*");
					two[0] += " ("+noun+")";
					//find how many closing brackets are needed in two[0] and form the two new chunks
					int lb = two[0].replaceAll("[^\\[]", "").length();
					int rb = two[0].replaceAll("[^\\]]", "").length();
					for(int i = 0; i<lb-rb; i++){
						two[0]+="]";
						two[1] = two[1].replaceFirst("\\]$", "");						
					}
					String newchunk = two[0]+" "+two[1];
					this.chunkedsent = this.chunkedsent.replace(chunk, newchunk);
					//replace the old chunk with two chunks in this.chunkedtokens 
					if(this.chunkedtokens.get(indexOfchunk+1).length()==0){
						this.chunkedtokens.set(indexOfchunk, two[0]);
						this.chunkedtokens.set(indexOfchunk+1, two[1]);
					}else if(this.chunkedtokens.get(indexOfchunk-1).length()==0){
						this.chunkedtokens.set(indexOfchunk-1, two[0]);
						this.chunkedtokens.set(indexOfchunk, two[1]);
					}
				}
				m = p.matcher(this.chunkedsent);
			}
		}
	}
	
	/**
	 * find the index in this.chunkedtokens that is near indexofkeyword and hold a chunk containing "keyword"
	 * @param indexOfkeyword
	 * @param keyword
	 * @return
	 */
	private int findChunk(int indexOfkeyword, String keyword) {
		//search downwards
		String chunk = "";
		int i = indexOfkeyword;
		do{
			i++;
			chunk = this.chunkedtokens.get(i);
		}while(chunk.length()==0);
		if(chunk.indexOf(keyword)>=0){
			return i;
		}
		//search upwards
		chunk = "";
		i = indexOfkeyword;
		do{
			i--;
			chunk = this.chunkedtokens.get(i);
		}while(chunk.length()==0);
		if(chunk.indexOf(keyword)>=0){
			return i;
		}		
		System.out.println("Wrong chunks in ChunkedSentence, System exiting.");
		System.exit(1); //should never reach here
		return 0;
	}
	/**
	 * l[(mid) and (distal) (cauline) {smaller}]
	 * ==>
	 * l[(mid) and (distal) (cauline)] {smaller}
	 */
	@SuppressWarnings("unused")
	private void removeStateFromList() {
		for(int i = 0; i<this.chunkedtokens.size(); i++){
			String t = this.chunkedtokens.get(i);
			if(t.matches("l\\[[^\\[]*?}\\]")){
				String list = t.substring(0, t.lastIndexOf(")")+1).trim();
				String state = t.replace(list, "").replaceFirst("\\]$", "").trim();
				list= list+"]";
				if(this.chunkedtokens.get(i+1).length()==0){
					this.chunkedtokens.set(i, list);
					this.chunkedtokens.set(i+1, state);
				}else if(this.chunkedtokens.get(i-1).length()==0){
					this.chunkedtokens.set(i-1, list);
					this.chunkedtokens.set(i, state);
				}else{
					System.err.println("removeStateFromList messed up");
				}
				this.chunkedsent = this.chunkedsent.replace(t, list+" "+state);
			}
			
		}
		
	}

	
	/**
	 * 3] {mm}
	 * 
	 */
	private void normalizeUnits(){
		for(int i = 0; i<this.chunkedtokens.size(); i++){
			String word = this.chunkedtokens.get(i);
			if(word.matches("[<{]("+ChunkedSentence.units+")[}>]")){
				if(i-1>=0){
					String latest = this.chunkedtokens.get(i-1);
					if(latest.matches(".*?\\d\\]+$")){
						String rest = latest.replaceAll("\\]+$", "").trim();
						String brackets = latest.replace(rest, "").trim();
						String norm = rest+ " "+word.replaceAll("[{}<>]", "")+brackets; //mm, not {mm}
						this.chunkedtokens.set(i-1, norm);
						this.chunkedtokens.set(i, "");
					}					
				}
			}
		}
	}
	
	/**
	 * shorter and wider than ...
	 * more/less smooth than ...
	 * pretty good now
	 */
	private int normalizeThan(){
		int count = 0;
		String np = "";
		int thani = 0;
		int firstmorei = this.chunkedtokens.size();
		String more = "";
		String preps = ChunkedSentence.prepositions.replaceFirst("\\bthan\\|", "").replaceFirst("\\bto\\|", "");
		if(this.markedsent.indexOf("than") >=0 ){
			if(this.printNormThan){
				System.out.println("Need to normalize Than! "+np);
			}
			for(int i = 0; i<this.chunkedtokens.size(); i++){
				//scan for JJRs
				String token = this.chunkedtokens.get(i);
				if(more.length()==0 && (token.matches(".*?\\b(\\w+er|more|less)\\b.*") && (token.indexOf("<")<0)|| this.markedsent.indexOf(token+" than")>=0)){ //<inner> is not, but <longer> than is 
					firstmorei = i;
					if(token.matches(".*?\\bmore\\b.*")){
						more = "more";
					}else if(token.matches(".*?\\b\\w+er\\b.*")){
						more = "er";
					}
				}else if(more.compareTo("er") == 0 && !token.matches(".*?\\b(\\w+er|more|less|and|or|than)\\b.*") ){
					more = "";
					firstmorei = this.chunkedtokens.size();;
				}
				if(token.matches(".*?\\bthan\\b.*")){
					//needs normalization
					thani = i;
					if(firstmorei < thani){
						//join all tokens between firstmorei and thani--this is the subject of "than"
						for(int j = firstmorei; j<=thani; j++){
							if(this.chunkedtokens.get(j).length()>0){
								np += this.chunkedtokens.get(j)+" ";
							}
							this.chunkedtokens.set(j, "");
						}
						
						//scan for the object of "than"
					
						for(i=i+1; i<this.chunkedtokens.size(); i++){
							String w = this.chunkedtokens.get(i).replaceAll("(\\<|\\>|\\{|\\}|\\w+\\[|\\])", "");
							//if(w.matches("\\b("+preps+"|and|or|that|which|but)\\b") || w.matches("\\W")){
							if(w.matches("\\b("+preps+"|and|that|which|but)\\b") || w.matches("\\p{Punct}")){ //should allow �, n[{shorter} than] � {campanulate} <throats>
								np = np.replaceAll("<", "(").replaceAll(">", ")").trim();
								this.chunkedtokens.set(thani, "n["+np+"]"); 
								count++;
								break;
							}else{
								if(this.chunkedtokens.get(i).length()>0){
									np += this.chunkedtokens.get(i)+" ";
								}
								this.chunkedtokens.set(i, "");
							}
						}
						if(this.printNormThan){
							System.out.println("Normalize Than! "+np);
						}
						thani = 0;
						firstmorei = this.chunkedtokens.size();
						np = "";
					}
				}
			
			}
		}
		return count;
	}
	
	/**
	 * expanded to <throats>
	 * to 6 m.
	 * 
	 */
	private int normalizeTo(){
		int count = 0;
		String np = "";
		boolean startn = false;
		//ArrayList<String> copy = (ArrayList<String>)this.chunkedtokens.clone();
		for(int i = 0; i<this.chunkedtokens.size(); i++){
			ArrayList<String> copy = (ArrayList<String>)this.chunkedtokens.clone();
			String token = this.chunkedtokens.get(i);
			if(token.compareTo("to") == 0 || token.matches(".*?\\bto]+$")){
				
				//scan for the next organ
				for(int j = i+1; j<this.chunkedtokens.size(); j++){
					String t = this.chunkedtokens.get(j).trim();
					if(j==i+1 && t.matches("\\d[^a-z]*")){//match "to 6[-9]" ; not match "to 5-lobed"
						copy = formRangeMeasure(i);
						break;
					}
					if(startn && t.indexOf('<')<0){
						break;
					}
					//to b[v[expose] o[(stigma)]]
					if(t.matches("[,:;\\d]") || t.matches(".*?\\b[pv]\\[.*") ||t.matches(".*?\\b("+ChunkedSentence.prepositions+"|and|or|that|which|but)\\b.*")){
						break;
					}
					np +=t+" ";
					this.chunkedtokens.set(j, "");
					if(t.lastIndexOf(' ') >=0){
						t = t.substring(t.lastIndexOf(' ')); //last word there
					}
					if(t.indexOf('<')>=0 || t.indexOf('(')>=0){ //t may have []<>{}
						startn = true; //not break yet, may be the next token is a noun
					}
				}
				if(!startn){
					this.chunkedtokens = copy; //not finding the organ, reset
				}else{
					if(this.printNormTo){
						System.out.println("To needs normalization!");
					}
					np = "to "+np;
					//scan forward for the start of the chunk
					boolean startc = false; //find the start of the chunk
					for(int j = i-1; j>=0; j--){
						String t = this.chunkedtokens.get(j);
						if(t.matches(".*?\\b("+ChunkedSentence.prepositions+"|and|or|that|which|but)\\b.*") || t.matches(".*?[>;,:].*") ||(t.matches("^\\w+\\[.*") && j!=i-1) ){ //the last condition is to avoid nested chunks. cannot immediately before w[].e.g: b[v[{placed}] o[{close}]] w[to {posterior} (shell) (margin)] ; 
							np = np.replaceAll("<", "(").replaceAll(">", ")").replaceAll("\\s+", " ").trim();
							//np = np.replaceAll("\\s+", " ").trim();
							this.chunkedtokens.set(i, "w["+np+"]"); //replace "to" with np
							count++;
							startn = false;
							startc = true;
							if(this.printNormTo){
								System.out.println("!normalizedTo! "+np);
							}
							break;
						}else{	
							np = t+" "+np;
							this.chunkedtokens.set(j, "");
						}
					}
					if(!startc){
						this.chunkedtokens = copy; //not finding the start of the chunk, reset
					}
			}
		}
	}
		return count;
}
	/**
	 * form a chunk if a pattern "to # unit" is found starting from i
	 * @param i: index of "to", which is followed by a number
	 * @return this.chunkedtokens
	 */
	private ArrayList<String> formRangeMeasure(int i) {
		String chunk = "to~"+this.chunkedtokens.get(i+1)+"~"; //"to"
		if(this.chunkedtokens.size()>i+2){
			String unit = this.chunkedtokens.get(i+2).replaceAll("\\W", " ").trim();
			if(unit.matches("("+ChunkedSentence.units+")")){
				chunk += unit;
				this.chunkedtokens.set(i+2, chunk);
				this.chunkedtokens.set(i+1, "");
				if(this.chunkedtokens.get(i).equals("to")){
					this.chunkedtokens.set(i, "");
				}else{
					this.chunkedtokens.set(i, this.chunkedtokens.get(i).replaceFirst("\\s+to(?=\\W+$)", ""));
				}
			}
		}		
		return this.chunkedtokens;
	}
	
	
	/**between 5 and 10
	 * between the frontal and the sphenotic spine 
	 * between anterior supraneural bone and neural spine of vertebra 4 
	 * between neural arches of vertebrae 3 and 4
	 * @return
	 * 5-10
	 * r[p[between] o[the frontal and the sphenotic spine]] 
	 * r[p[between] o[anterior supraneural bone and neural spine]] of vertebra 4 
	 * r[p[between] o[neural arches]] of vertebrae 3 and 4
	 * 
	 * 
	 * what about:
	 * 948[Armbruster_2004.xml_ffbaa153-5288-4671-866c-33d14c78c44e.txt-0]: 
	 * <space> r[p[between] o[{posterior} (process)]] r[p[of] o[(coracoid) (strut) and {posterior} (process)]] r[p[of] o[(coracoid)]]
	 */
	private int normalizeBetween(){
		int count = 0;
		for(int i = 0; i<this.chunkedtokens.size(); i++){
			String token = this.chunkedtokens.get(i);			
			if(token.matches(".*?\\bbetween\\b.*")){//between
				if(this.printNorm){
					System.out.println(token+" needs normalization!");	
				}
				if(token.matches("r\\[.*? and .*?\\]")){//already a chunk, fix the format: r[p[between] the {frontal} and o[the (sphenotic) ({spine})]]
					token = token.replaceFirst("o\\[", "").replaceFirst("\\]\\s*", "] o[");
					this.chunkedtokens.set(i, token);
					return ++count;
				}
				String chara = Utilities.lookupCharacter(this.chunkedtokens.get(i+1).replaceAll("[<>(){}\\]\\[]", ""), conn, characterhash, this.glosstable, this.tableprefix);
				if(this.chunkedtokens.get(i+1).matches("\\d+.*") || (chara!=null && chara.compareToIgnoreCase("structure")!=0)){
					//deal with "between 5 and 10" => "5-10"
					//between red and purple => red to purple
					count += normalizeBetweenCharacters(i);
				}else{
					//find the nearest "and" that is not separated from "between" by any stopwords or puncts
					//if such "and" can not be found, find the nearest pl structure terms
					count += normalizeBetweenStructures(i);
				}
				
			}
		}
		return count;
	}
	
	private int normalizeBetweenCharacters(int prepindex) {
		// TODO Auto-generated method stub
		return 0;
	}
	
	/**normalize one instance of "between"
	 * find the nearest "and" that is not separated from "between" by any stopwords or puncts
	 * if such "and" can not be found, find the nearest pl structure terms
	 * @param the index for the prep (between) and it is the starting point for the search in chunkedtoken
	 * @return
	 * r[p[between] o[the frontal and the sphenotic spine]] 
	 * r[p[between] o[anterior supraneural bone and neural spine]] of vertebra 4 
	 * r[p[between] o[neural arches]] of vertebrae 3 and 4
	 */
	private int normalizeBetweenStructures(int prepindex) {
		int nearestN1 = 0;
		int nearestN2 = 0;
		int nearestAND = 0;
		for(int i = prepindex+1; i < this.chunkedtokens.size(); i++){
			String token = this.chunkedtokens.get(i);
			if(nearestAND ==0 && (token.matches(".*?(\\b("+ChunkedSentence.prepositions+")\\b|,|\\.).*"))){
				//failed to find "and", make the chunk stop by nearestN1
				//check this before checking for < or (
				return makeChunk4Between(prepindex, nearestN1);
				
			}
			if(nearestAND == 0 && ((token.contains("<") || token.contains("(")))){
				nearestN1 = i;
			}
			if(nearestAND == 0 && (token.compareToIgnoreCase("and")==0)){
				nearestAND = i;
			}

			if(nearestAND > 0 && ((token.contains("<") || token.contains("(")))){
				nearestN2 = i;
			}
			if(nearestAND > 0 && nearestN2>0 && !token.contains("<") && !token.contains("(")){
				//find the 2nd organ, make the chunk
				return makeChunk4Between(prepindex, nearestN2);
			}
			if(nearestAND > 0 && (token.matches(".*?(\\b("+ChunkedSentence.prepositions+")\\b|,|\\.).*"))){
				//failed to find nearestN2, make the chunk stop now
				return makeChunk4Between(prepindex, i-1);				
			}
		}
		return 0;
	}
	/**
	 * form a chunk using all tokens from prepindex to endindex
	 * reset all these tokens in chunkedtokens
	 * put the chunk at the prepindex
	 * @param prepindex
	 * @param endindex
	 */
	private int makeChunk4Between(int prepindex, int endindex) {
		//String chunk = "r[p["+this.chunkedtokens.get(prepindex)+"] o[";
		if(endindex <= prepindex) return 0;
		String chunk = "";
		for(int i = prepindex+1; i<=endindex; i++){
			String t = this.chunkedtokens.get(i);
			t = t.contains("<") || Utilities.isPosition(t.replaceAll("[{}]", ""), conn, this.glosstable)? "("+t.replaceAll("[<>(){}]", "")+")": t;
			chunk +=t+" ";
			this.chunkedtokens.set(i, "");
		}
		if(this.chunkedtokens.get(prepindex).contains("[between]")){
			chunk = this.chunkedtokens.get(prepindex).replaceAll("\\]+$", " ")+chunk.trim()+"]]";
		}else{ //bare word between
			chunk = "r[p["+this.chunkedtokens.get(prepindex)+"] o["+chunk.trim()+"]]";
		}
		this.chunkedtokens.set(prepindex, chunk);
		return 1;
	}
	/**
	 * most [of] lengths
	 * [in] zyz arrays
	 */
	private int normalizeOtherINs(){
		
		//boolean startn = false;
		int count = 0;
		String preps = ChunkedSentence.prepositions.replaceAll("\\b(than|to|between)\\|", "");
		for(int i = 0; i<this.chunkedtokens.size(); i++){
			String token = this.chunkedtokens.get(i);
			
			if(token.matches(".*?p\\[\\{?[a-z]+\\}?\\]+") || token.matches(".*?\\b("+preps+")\\b\\]*$") ||
					token.matches(".*?\\b(as-.*?-as|same-.*?-as|\\w+-to|in-.*?-(with|to))\\b.*?")){//[of] ...onto]]
				token = token.replaceAll("[{}]", "");
				if(this.printNorm){
					System.out.println(token+" needs normalization!");
				}
				// a prep is identified, needs normalization
				ArrayList<String> copy = (ArrayList<String>)this.chunkedtokens.clone();
				//String nscopy = null;
				String npcopy = null;
				ArrayList<String> ctcopy = null;
				boolean startn = false;
				String np = "";
				//String ns = "";
				boolean foundorgan = false;
				//boolean ofnumber = false;
				//lookforward in chunkedtokens to find the object noun
				int j = 0;
				for(j = i+1; j<this.chunkedtokens.size(); j++){
					String t = this.chunkedtokens.get(j).trim();
					if(j==i+1 && t.matches("^[,;\\.]")){//"smooth throughout, ", but what about "smooth throughout OR hairy basally"?
						if(this.printNorm){
							System.out.println("encounter ',' immediately, no object is expected");
						}
						break; 						
					}
					/*if(t.startsWith("r[p[") && !np.matches(".*?\\b(or|and)\\b\\s+$")){
						npcopy = np;//TODO: 4/14/2011 check out 501.txt-4, 502.txt-5 "after flowering, 10 cm in fruit" 512.txt-11 "differing from inner, highly variable in <color>"
						break;
					}*/
					if(!foundorgan && startn && t.indexOf('<')<0 && t.indexOf('(')<0 && !Utilities.isNoun(t, nouns, notnouns)){ //test whole t, not the last word once a noun has been found
						//save ns for now, but keep looking for organs
						//nscopy = nscopy == null ? ns : nscopy; //keep only the first copy
						npcopy = npcopy == null? np : npcopy;
						ctcopy = ctcopy == null? (ArrayList<String>)this.chunkedtokens.clone():ctcopy;
					}
					//if(startn && !foundorgan && ishardstop(j)){
					if(!foundorgan && ishardstop(j)){
						//hard stop encountered, break
						//ns = nscopy;
						if(npcopy!=null && ctcopy!=null){
							np = npcopy;
							this.chunkedtokens = ctcopy;
						}
						break;
					}
					
					if(foundorgan && t.indexOf('<')<0 && t.indexOf('(')<0){ //test whole t, not the last word once a noun has been found
						break; //break, the end of the search is reached, found organ as object
					}
					
					np +=t+" "; //any word in betweens
					this.chunkedtokens.set(j, "");
					
					if(t.indexOf('<')>=0 ||t.indexOf('(')>=0){ //t may have []<>{}
						startn = true; //not break yet, may be the next token is also a noun
						foundorgan = true;
					}
					
					if(!foundorgan && Utilities.isNoun(t, nouns, notnouns)){ //t may have []<>{}
						startn = true; //won't affect the value of foundorgan, after foundorgan is true, "plus" problem
						if(TermOutputerUtilities.isPlural(t)){
							foundorgan = true;
							np = np.trim();
							if(np.lastIndexOf(" ")>0){
								np = np.substring(0, np.lastIndexOf(" "))+" "+ "("+t.replaceAll("\\W", "")+") ";
							}else{
								np = "("+np.replaceAll("\\W", "")+") ";
							}
						}
					}
				}
				
				/*
				 for(int j = i+1; j<this.chunkedtokens.size(); j++){
					String t = this.chunkedtokens.get(j).trim();
					if(startn && t.indexOf('<')<0 && !TermOutputerUtilities.isNoun(t, nouns)){ //test whole t, not the last word once a noun has been found
						break; //break, the end of the search is reached
					}
					np +=t+" ";
					this.chunkedtokens.set(j, "");
					
					if(t.indexOf('<')>=0 ||t.indexOf('(')>=0 || TermOutputerUtilities.isNoun(t, nouns)){ //t may have []<>{}
						startn = true; //not break yet, may be the next token is a noun
						ns += t+" ";
					}
				} 
				 */
				//form the normalized chunk
				if(foundorgan || npcopy!= null /*|| ofnumber*/){
					//ns = ns.trim();
					//if(!ns.endsWith("]")){ //not already a chunk
						//np = np.replace(ns, "").trim();
						//ns  = "("+ns.replaceAll("[{(<>)}]", "").replaceAll("\\s+", ") (")+")"; //mark the object as organ word by word
						//np = (np.replaceAll("<", "(").replaceAll(">", ")")+" "+ns).trim();
						np = np.replaceAll("<", "(").replaceAll(">", ")").replaceAll("\\s+", " ").trim();
					//}
					String symbol = "o";	
					/*if(ofnumber){
						symbol = "c";
					}*/
					if(token.indexOf('[')>=0){
						String rest = token.replaceFirst("\\]+$", "").trim();
						String brackets = token.replace(rest, "").replaceFirst("\\]$", "").trim();
						token = rest + "] "+symbol+"["+np.trim()+"]"+brackets;
						this.chunkedtokens.set(i, token);
						if(this.printNorm){
							System.out.println("!normalized!: "+token);
						}
					}else{//without [], one word per token
						token = "r[p["+token+"] "+symbol+"["+np.trim()+"]]";
						this.chunkedtokens.set(i, token);
						if(this.printNorm){
							System.out.println("!normalized!: "+token);
						}
					}
					count++;
				}else{ 
					if(j-i==1){
						//cancel the normalization attempt on this prep, return to the original chunkedtokens
						this.chunkedtokens = copy;
					}else if(np.matches(".*? [\\d+%]$")){//reached the end of the sentence.This is the case for "plumose on distal 80 % ."?
						//also the same width dorsally as proximally
						this.chunkedtokens = copy;
						//np = np.replaceAll("\\s+", " ").trim();
						String head = token.replaceFirst("\\]+$", "").trim();
						String brackets = token.replace(head, "").replaceFirst("\\]$", "").trim();
						String rest = np.replaceFirst(".*?(?=(\\.|;|,|\\band\\b|\\bor\\b|\\w\\[))", "").trim();
						np = np.replace(rest, ""); //perserve spaces for later
						String object = np.replaceAll("\\s+", " ").trim();
						if(object.length()>0){
							token = head + "] o["+np.replaceAll("\\s+", " ").trim()+"]"+brackets;
							this.chunkedtokens.set(i, token);
							int npsize = np.split("\\s").length; //split on single space to perserve correct count of tokens
							for(int k = i+1; k<=i+npsize; k++){
								this.chunkedtokens.set(k, "");
							}
							if(this.printNorm){
								System.out.println("!default normalized to (.|;|,|and|or|r[)!: "+token);
							}
							count++;
						}
					}else{
						//cancel the normalization attempt on this prep, return to the original chunkedtokens
						this.chunkedtokens = copy;
					}
				}
			}
			//i=i+1;

		}
		/*if(!startn){
			this.chunkedtokens = copy;
		}*/
		return count;
	}
	

	

	private boolean ishardstop(int j) {
		String t1 = this.chunkedtokens.get(j).trim();
		
		if(t1.matches("^\\w\\[.*")){
			return true;
		}
		if(t1.startsWith(".")){
			return true;
		}
		
		if(this.chunkedtokens.size()==j+1){
			return true;
		}

		String t2 = this.chunkedtokens.get(j+1).trim();
		if(t1.startsWith(",") && t2.matches("^\\W*[<(].*")){
			return true;
		}
		return false;
	}
	public String toString(){
		return this.chunkedsent;
	}
	public int getPointer(){
		return this.pointer;
	}
	
	//end mohan code
	public void setInSegment(boolean yes){
		this.inSegment = yes;
	}
	
	public void setRightAfterSubject(boolean yes){
		this.rightAfterSubject = yes;
	}
	

	public boolean hasNext(){
		if(pointer <this.chunkedtokens.size()){
			return true;
		}
		return false;
	}
	
	public int getSize(){
		return this.chunkedtokens.size();
	}
	
	public Chunk nextChunk(){
		
		Chunk ck = getNextChunk();
		while(ck==null && this.hasNext()){
			ck=this.getNextChunk();
		}
		if(ck instanceof ChunkOrgan){
			this.rightAfterSubject = true;
		}else{
			this.rightAfterSubject = false;
		}
		return ck==null? new ChunkEOS(".") : ck;
		
	}
	/**
	 * returns the next Chunk: may be a
	 * Organ, Value, Comparative Value, SimpleCharacterState, Subclause,
	 * PrepChunk, IVerbChunk (Intransitive verb chunk, followed by a preposition), VerbChunk, ADJChunk
	 * @return
	 */
	@SuppressWarnings("rawtypes")
	public Chunk getNextChunk(){
		Chunk chunk = null;
		String token = this.chunkedtokens.get(pointer);////a token may be a word or a chunk of text
		while(token.trim().length()==0){
			pointer++;
			token = this.chunkedtokens.get(pointer);
		}
		token = token.compareTo("�")==0? "moreorless" : token;
		token = token.matches(".*?\\d.*")? NumericalHandler.originalNumForm(token) : token;
		
		if(token.contains("relative~")){
			pointer++;
			return new ChunkCharacterComparison(token);
		}
		//all tokens: 
		//number:
		//if(token.matches(".*?\\d+$")){ //ends with a number
		if(NumericalHandler.isNumerical(token) ||token.matches("^to~\\d.*")|| token.matches("h\\s*\\W\\s*w")|| token.matches("l\\s*\\W\\s*w")){//l-w or l/w
				chunk = getNextNumerics();//pointer++;
				if(this.unassignedmodifier != null){
					chunk.setText(this.unassignedmodifier+ " "+chunk.toString());
				}
				return chunk;
		}
		
		if(token.indexOf("�")>0 && token.length()>0 && token.indexOf(" ")<0){
			//token: 4-9cm�usually15-25mm			
			String[] dim = token.split("�");
			boolean isArea = true;
			int c = 0;
			for(int i = 0; i<dim.length; i++){
				isArea = dim[i].matches(".*?\\d.*") && isArea;
				c++;
			}
			if(isArea && c>=2){
				token = token.replaceAll("�[^0-9]*", " � ").replaceAll("(?<=[^a-z])(?=[a-z])", " ").replaceAll("(?<=[a-z])(?=[^a-z])", " ").replaceAll("\\s+", " ").trim();
				chunk = new ChunkArea(token);
				pointer++;
				return chunk;
			}
		}

		if(token.indexOf("=")>0){//chromosome count 2n=, FNA specific
			String l = "";
			String t= this.chunkedtokens.get(pointer++);
			while(t.indexOf("SG")<0){
				l +=t+" ";
				t= this.chunkedtokens.get(pointer++);				
			}
			l = l.replaceFirst("\\d[xn]=", "").trim();
			chunk = new ChunkChrom(l);
			return chunk;
		}
		//create a new ChunkedSentence object for bracketed text 
		if(token.startsWith("-LRB-/-LRB-")){
			ArrayList<String> tokens = new ArrayList<String>();
			String text = "";
			if(token.indexOf("-RRB-/-RRB-")<0){
				String t = this.chunkedtokens.get(++this.pointer);
				while(!t.endsWith("-RRB-/-RRB-")){
					tokens.add(t);
					text += t+ " ";
					if(this.pointer+1 < this.chunkedtokens.size()) t = this.chunkedtokens.get(++this.pointer); //missing RRB
					else break;
				}
			}
			text=text.trim();
			if(text.length()>0){ //when -LRB- and -RRB- are on the same line, text="" for example, as in -LRB-/-LRB-3--RRB-/-RRB-5-{merous} (3-)5-{merous}
				this.pointer++;
				if(!text.matches(".*?[,;\\.:]$")){
					text +=" .";
					tokens.add(".");
				}
				Chunk c = new ChunkBracketed(text);
				c.setChunkedTokens(tokens);
				return c;
			} //else, continue on
		}
		//create a new ChunkedSentence object
		if(token.startsWith("s[")){
			ArrayList<String> tokens = new ArrayList<String>();
			String text = token.replaceFirst("s\\[", "").replaceFirst("\\]$", "");
			//break text into correct tokens: s[that is {often} {concealed} r[p[by] o[(trichomes)]]] ;
			tokens = Utilities.breakText(text);
			this.pointer++;
			text=text.trim();
			if(!text.matches(".*?[,;\\.:]$")){
				text +=" .";
				tokens.add(".");
			}
			Chunk c = new ChunkSBAR(text);
			c.setChunkedTokens(tokens);
			return c;
		}
		if(token.matches("\\W") ){//treat L/RRBs as either , or null
			pointer++;
			this.unassignedmodifier = null;
			return new ChunkComma("");
		}
		
		if(token.matches("\\b(and|either)\\b")){
			pointer++;
			this.unassignedmodifier = null;
			return null;
		}
		//end of a segment
		if(token.matches("SG[;:\\.]SG")){
			this.inSegment = false;
			pointer++;
			//this.unassignedmodifier = null;
			return new ChunkEOL(""); //end of line/statement
		}
		
		if(token.matches("SG,SG")){
			this.inSegment = false;
			pointer++;
			this.unassignedmodifier = null;
			return new ChunkEOS("");//end of segment/substence
		}
		
		//start of a segment
		if(!this.inSegment){
			this.inSegment = true;
			chunk = getNextOrgan();//pointer++
			if(chunk != null){
				this.unassignedmodifier = null;
				return chunk;
			}
		}
		
		//all chunks
		if(token.matches("^\\w+\\[.*")){
			String type = chunkType(pointer);
			token = this.chunkedtokens.get(pointer); //as checkType may have reformatted token.
			try{
				if(type != null){
					Class c = Class.forName("fna.charactermarkup."+type);
					Constructor cons = c.getConstructor(String.class);
					pointer++;
					//deal with any unassignedmodifier when EOS is approached.
					//if(this.unassignedmodifier != null && this.chunkedtokens.get(pointer).matches("(SG)?\\W(SG)?")){
					if(this.unassignedmodifier != null){ //did not see why the 2nd condition is needed. Here, assuming any unassigned modifier should be applied to the next valid chunk
						token = token.replaceFirst("\\[", "["+this.unassignedmodifier+" ");
						this.unassignedmodifier = null;
					}
					return (Chunk)cons.newInstance(token.trim());
				}else{//if the chunk is not correctly formatted. Forward pointer to the next comma.
					//forward pointer to after the next [;:,.]
					if(this.printExp){
						System.out.println("PP without a Noun: "+token);
					}
					pointer++;
					/*String t = "";
					do{
						if(this.pointer < this.chunkedtokens.size()){
							t = this.chunkedtokens.get(this.pointer++);
						}else{
							break;
						}
					}while (!t.matches("[,;:\\.]"));*/
					return null;
				}
			}catch(Exception e){
				e.printStackTrace();
			}
		}
		
		//OR:
		if(token.compareTo("or") == 0){
			this.pointer++;
			return new ChunkOR("or");
		}
		
		//text:
		chunk = composeChunk();
		
		return chunk;
	}
	
	
	@SuppressWarnings("rawtypes")
	private Chunk composeChunk() {
		Chunk chunk;
		String token;
		String scs = "";
		String role = "";
		boolean foundo = false;//found organ
		boolean founds = false;//found state
		if(this.unassignedmodifier != null){
			scs =(scs.trim().length()>0? scs.trim()+"] ": "")+"m["+this.unassignedmodifier.replaceAll("[{}]", "")+" ";
			this.unassignedmodifier = null;
		}
		int i = 0;
		for(i = this.pointer; i<this.chunkedtokens.size(); i++){
			token = this.chunkedtokens.get(i);
			/* if one of the tokens match those in the stop list but not in skip list, skip it and get the next token- mohan 10/19/2011*/
			if(token.matches("("+stop+")") && !token.matches("("+skip+")")){
				i=i+1;
				token = this.chunkedtokens.get(i);
			}
			/*end mohan 10/19/2011*/
			token = token.matches(".*?\\d.*")? NumericalHandler.originalNumForm(token):token;
			if(token.length()==0){
				continue;
			}
			//token = NumericalHandler.originalNumForm(token); //turn -LRB-/-LRB-2
			if(token.matches("^\\w+\\[.*")){ 
				//modifier +  a chunk: m[usually] n[size[{shorter}] constraint[than or {equaling} (phyllaries)]]
				//if(scs.matches("\\w{2,}\\[.*") && token.matches("\\w{2,}\\[.*")){ // scs: position[{adaxial}] token: pubescence[{pubescence~list~glabrous~or~villous}]
				if(scs.matches(".*?\\bo\\[\\w+\\s.*")){
					pointer = i;
					scs = scs.replaceAll("o\\[", "o[(").trim()+")]";
					return new ChunkNonSubjectOrgan("u["+scs+"]");
				}else if(scs.matches(".*?\\w{2,}\\[.*")){
					pointer = i;
					return new ChunkSimpleCharacterState("a["+scs.trim()+"]]"); 
				}else {
					String type = chunkType(i); //changed from pointer to i
					token = this.chunkedtokens.get(i);
					token = token.matches(".*?\\d.*")? NumericalHandler.originalNumForm(token):token;
					scs = scs.trim().length()>0? scs.trim()+"] " : ""; //modifier 
					String start = token.substring(0, token.indexOf("[")+1); //becomes n[m[usually] size[{shorter}] constraint[than or {equaling} (phyllaries)]]
					String end = token.substring(start.length());
					token = start+scs+end;
					try{
						if(type !=null){//r[p[as]] without o[]
							Class c = Class.forName("fna.charactermarkup."+type);
							Constructor cons = c.getConstructor(String.class);
							pointer = i+1;
							return (Chunk)cons.newInstance(token.trim());
						}else{ //parsing failure, continue with the next chunk
							pointer = i+1;
							return null;
						}
					}catch(Exception e){
						e.printStackTrace();
					}
				}
			}
			
			role = token.charAt(0)+"";
			token = token.replaceAll("[<>{}]", "");
			//<roots> {usually} <taproots> , {sometimes} {fibrous}.
			String symbol= this.rightAfterSubject? "type" : "o";
			if(!foundo && role.compareTo("<")==0){
				scs = (scs.trim().length()>0? scs.trim()+"] ": "")+symbol+"["+token+" ";
				foundo = true;
			}else if(foundo && role.compareTo("<")==0){
				scs += token+" ";
			}else if(foundo && role.compareTo("<") !=0){
				this.pointer = i;
				scs = scs.replaceFirst("^\\]\\s+", "").replaceFirst(symbol+"\\[", "###[").replaceAll("\\w+\\[", "m[").replaceAll("###\\[", symbol+"[").trim()+"]"; //change all non-type character to modifier: <Inflorescences> {indeterminate} <heads>
				if(!this.rightAfterSubject){
					//reformat m[] o[] o[] to m[] o[()] o[()]
					String m = scs.substring(0, scs.indexOf("o["));
					String o = scs.substring(scs.indexOf("o[")).replaceAll("\\[", "[(").replaceAll("\\]", ")]");
					scs = m+o;
				}
				return this.rightAfterSubject? new ChunkSimpleCharacterState("a["+scs+"]") : new ChunkNonSubjectOrgan("u["+scs+"]"); //must have type[ or o[
			}
			
			if(token.matches(".*?"+NumericalHandler.numberpattern+"$") || token.matches("\\d+\\+?") || token.matches("^to~\\d.*")){ //0. sentence ends with a number, the . is not separated by a space
				if(scs.matches(".*?\\w{2,}\\[.*")){//must have character[
					pointer=i;
					scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
					return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
				}else{
					pointer=i;
					chunk = getNextNumerics();
					if(chunk!=null){
						if(scs.length()>0){
							scs = scs.replaceFirst("^\\]", "").trim()+"] "+chunk.toString();
						}else{
							scs = chunk.toString();
						}
						chunk.setText(scs);
						return chunk;
					}else{
						pointer++;
						return chunk; //return null, skip this token: parsing failure
					}
				}
			}

			
			//add to a state chunk until a) a preposition b) a punct mark or c)another state is encountered
			if(role.compareTo("<") !=0 && true){
				String chara = Utilities.lookupCharacter(token, conn, characterhash, glosstable, tableprefix);
				if(chara==null && Utilities.isAdv(token, adverbs, notadverbs)){
					scs = scs.trim().length()>0? scs.trim()+ "] m["+token+" " : "m["+token;
				}else if(token.matches(".*[,;:\\.\\[].*") || token.matches("\\b("+ChunkedSentence.prepositions+"|or|and)\\b") || token.compareTo("-LRB-/-LRB-")==0){
					this.pointer = i;
					if(scs.matches(".*?\\w{2,}\\[.*")){//must have character[
						scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
						return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
					}else{
						if(scs.indexOf("m[")>=0){
							this.unassignedmodifier = "{"+scs.trim().replaceAll("(m\\[|\\])", "").replaceAll("\\s+", "} {")+"}";
						}
						if(this.pastpointers.contains(i+"")){
							this.pointer = i+1;
						}else{
							this.pastpointers.add(i+"");
						}
						//if(token.matches("SG.SG")) return new ChunkEOS("");
						return null;
					}
				}else{
					//String chara = TermOutputerUtilities.lookupCharacter(token, conn, characterhash, glosstable, tableprefix);
					if(!founds && chara!=null){
						scs = (scs.trim().length()>0? scs.trim()+"] ": "")+chara+"["+token+" ";
						founds = true;
						if(i+1==this.chunkedtokens.size()){ //reach the end of chunkedtokens
							scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
							this.pointer = i+1;
							return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
						}
					}else if(founds && chara!=null && scs.matches(".*?"+chara+"\\[.*")){ //coloration coloration: dark blue
						scs += token+" ";
					}else if(founds){
						//By Zilong
						/*orig:a[{more} ventrally] a[{directed}]*/
						/*should be:a[more ventrally directed] */
						
						if(scs.matches("^comparison\\[more\\]\\s+m\\[\\w+\\s+$")){
						//now it only handles the simplest case, only consider "more"	
							this.pointer = i+1;
							scs = scs.replaceFirst("comparison\\[", "m\\[");
							scs = scs.replaceFirst("\\] m\\[", " ").trim()+"] ";
							scs += chara+"["+token+" ";
						}else{
						//By Zilong End
							this.pointer = i;
						}
						scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";
						return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
						
					}else if(chara==null){
						if(Utilities.isVerb(token, verbs, notverbs) && !founds){//construct ChunkVP or ChunkCHPP
							scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"v["+token+" ";
							//continue searching for either a <> or a r[]
							boolean findc = false; //find a chunk
							boolean findo = false; //find an organ
							boolean findm = false; //find a modifier
							boolean findt = false; //find a text token
							for(int j = i+1; j < this.chunkedtokens.size(); j++){
								String t = this.chunkedtokens.get(j).trim();
								if(t.length() == 0){continue;}
								if(t.startsWith("u[")){//form a vb chunk
									t = t.replaceFirst("^u\\[", "").replaceFirst("\\]$", "");
									String o = t.substring(t.indexOf("o[")).trim();
									t = t.substring(0, t.indexOf("o[")).trim();
									if(t.length()>0){
										String[] states = t.split("\\s+");
										for(int k = 0; k < states.length; k++){
											String ch = Utilities.lookupCharacter(states[k], conn, characterhash, glosstable, tableprefix);
											if(ch!=null){
												scs = (scs.trim().length()>0? scs.trim()+"] ": "")+ch+"["+states[k].replaceAll("[{}]", "")+" ";
											}else{
												scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+states[k].replaceAll("[{}]", "")+" ";
											}
										}		
									}
									scs = (scs.trim().length()>0? scs.trim()+"] ": "")+o;
									this.pointer = j+1;
									return new ChunkVP("b["+scs+"]"); 
								}
								String ch = Utilities.lookupCharacter(t, conn, characterhash, glosstable, tableprefix);
								if((!findc &&!findo) && t.matches("^[rwl]\\[.*")){
									scs = scs.replaceFirst("^\\]\\s+", "").trim()+"] ";
									scs += t;
									findc = true;
								}else if(!findo && t.indexOf("<")>=0){
									scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"o["+t.replace("<", "(").replace(">", ")").replaceAll("[{}]", "")+" ";
									findo = true;
								}else if(!findo && !findc && ch!=null){
									scs = (scs.trim().length()>0? scs.trim()+"] ": "")+ch+"["+t.replaceAll("[{}]", "")+" ";
								}else if(!findo && !findc && !findm && Utilities.isAdv(t, adverbs, notadverbs)){
									scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+t.replaceAll("[{}]", "")+" ";
									findm = true;
								}else if(!findo && !findc && findm && Utilities.isAdv(t, adverbs, notadverbs)){
									scs += t.replaceAll("[{}]", "")+" ";
								}else if(findo && t.indexOf("<")>=0){
									scs += t.replace("<", "(").replace(">", ")").replaceAll("[{}]", "")+" ";
								}else if((findo || findc) && t.indexOf("<")<0){ //must have foundo or foundc
									this.pointer = j;
									if(findo){scs = scs.replaceFirst("^\\]\\s+", "").trim()+"]";}
									if(scs.indexOf("p[")>=0){
										return new ChunkCHPP("t["+scs.replace("v[", "c[")+"]");
									}else{
										scs = scs.replace("l[", "o[");
										if(scs.matches(".*?\\bv\\[[^\\[]* m\\[.*")){//v[comprising] m[a] architecture[surrounding] o[(involucre)]
											scs = format(scs);
											//scs = scs.replaceFirst("\\] o\\[", " ").replaceFirst("\\] m\\[", "] o[");
										}else if(scs.matches(".*?\\bv\\[[^\\[]* \\w{2,}\\[.*")){//v[comprising]  architecture[surrounding]
											scs = format(scs);
											//scs = scs.replaceFirst("\\] o\\[", " ").replaceFirst("\\] \\w{2,}\\[", "] o[");
										}
										return new ChunkVP("b["+scs+"]"); 
									}
								}else if(t.matches(".*?\\W.*") || t.matches("\\b("+ChunkedSentence.prepositions+"|or|and)\\b") || t.compareTo("-LRB-/-LRB-")==0){
									if(scs.matches(".*?\\w{2,}\\[.*")){ //borne {singly
										this.pointer = j;
										scs = (scs.replaceFirst("^\\]", "").trim()+"]").replaceFirst("\\bv\\[[^\\[]*?\\]\\s*", "");
										return new ChunkSimpleCharacterState("a["+scs.trim()+"]");
									}else{
										//search failed
										if(this.pastpointers.contains(i+"")){
											this.pointer = i+1;
										}else{
											this.pointer = i;
											this.pastpointers.add(i+"");
										}
										return null;
									}
								}else if(!findt){ //usually v[comprising] m[a {surrounding}] o[involucre]
									scs = (scs.trim().length()>0? scs.trim()+"] ": "")+"m["+t+" "; //taking modifiers
									findt = true;
								}else if(findt){
									scs += t+" ";
								}
							}
						}else{
							scs = "";
						}
					}
				}
			}
		}
		if(i==this.chunkedtokens.size()){
			this.pointer = this.chunkedtokens.size();
		}
		return null;
	}
	

	/**
	 * 
	 * @return e.g. 3 cm, what about "3 cm to 10 dm"?
	 * also 3 times (... longer than, as wide as ...)
	 */
	/*private Chunk getNextBroken() {
		String result = "";
		String type = "";
		boolean found = false;
		for(int i = pointer; i<this.chunkedtokens.size(); i++){
			if(this.chunkedtokens.get(i).matches(".*?-")){ //ends with a hyphen
				result += this.chunkedtokens.get(i)+ " ";
				found = true;
				type = checkType(i);
			}
			if(found){
				result += this.chunkedtokens.get(i)+ " ";
				pointer = i+1;
				try{
					if(type != null){
						Class c = Class.forName(type);
						Constructor cons = c.getConstructor(String.class);
						return (Chunk)cons.newInstance(result.replaceAll("[<>]", "").trim());
					}else{
						return new SimpleCharacterState(result.replaceAll("[<>]", "").trim());
					}
				}catch(Exception e){
					e.printStackTrace();
				}
			}
		}
		return null;
	}*/
	/**
	 * m[usually] v[comprising] m[a] architecture[surrounding] o[(involucre)]
	 * 
	 * m[usually] v[comprising] o[1 architecture[surrounding] (involucre)]
	 */
	private String format(String scs) {
		String first = scs.substring(0, scs.indexOf("v["));
		String rest = scs.replace(first, "");
		String v = rest.substring(0, rest.indexOf(']')+1+0);
		String o = rest.replace(v, "").trim(); //m[a] architecture[surrounding] o[(involucre)]
		String newo = "o[";
		do{
			String t = o.indexOf(' ')>=0? o.substring(0, o.indexOf(' ')) : o;
			o = o.replaceFirst(t.replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]").replaceAll("\\(", "\\\\(").replaceAll("\\)", "\\\\)"),"").trim();
			if(t.startsWith("m[")){
				t = t.replaceAll("(m\\[|\\])", "").trim();
				if(t.compareTo("a") == 0 && !o.matches("(couple|few)")){
					t = "1";
				}
			}
			if(t.startsWith("o[")){
				t=t.replaceAll("(o\\[|\\])", "").trim();
			}
			newo+=t+" ";
			
		}while(o.length()>0);
		return first+v+" "+newo.trim()+"]";
	}
	/**
	 * TODO: deal with LRB-/-LRB
	 * @return e.g. 3 cm, what about "3 cm to 10 dm"?
	 * also 3 times (... longer than, as wide as ...)
	 */
	private Chunk getNextNumerics() {
		String numerics = "";
		String t = this.chunkedtokens.get(this.pointer);
		t = NumericalHandler.originalNumForm(t).replaceAll("\\?", "");		
		if(t.matches("^to~\\d.*")){
			this.pointer++;
			return new ChunkValue(t.replaceAll("~", " ").trim());
		}
		/*if(t.matches(".*?("+ChunkedSentence.percentage+")")){ //10percent won't work because it won't be seen as a numerical value in the first place
			numerics += t+ " ";
			pointer++;
			return new ChunkValuePercentage(numerics.trim());
		}
		if(t.matches(".*?("+ChunkedSentence.degree+")")){
			numerics += t+ " ";
			pointer++;
			return new ChunkValueDegree(numerics.trim());
		}*/
		if(t.matches(".*?[()\\[\\]\\-\\�\\d\\.�\\+���/�\\*/%]*?[�/�\\d][()\\[\\]\\-\\�\\d\\.�\\+���/�\\*/%]*(-\\s*("+ChunkedSentence.counts+")\\b|$)")){ //ends with a number
			numerics += t+ " ";
			pointer++;
			if(pointer==this.chunkedtokens.size()){
				return new ChunkCount(numerics.replaceAll("[{()}]", "").trim());
			}
			t = this.chunkedtokens.get(this.pointer);//read next token
			if(t.matches("^[{<(]*("+ChunkedSentence.percentage+").*")){
				numerics += t+ " ";
				pointer++;
				return new ChunkValuePercentage(numerics.replaceAll("[{(<>)}]", "").trim());
			}
			if(t.matches("^[{<(]*("+ChunkedSentence.degree+")\\b.*")){
				numerics += t+ " ";
				pointer++;
				return new ChunkValueDegree(numerics.replaceAll("[{(<>)}]", "").trim());
			}
			if(t.matches("^[{<(]*("+ChunkedSentence.units+")\\b.*?")){
				numerics += t+ " ";
				pointer++;
				adjustPointer4Dot(pointer);//in bhl, 10 cm . long, should skip the ". long" after the unit
				numerics = numerics.replaceAll("[{(<>)}]", "").trim();
				if(numerics.contains("�")){
					return new ChunkArea(numerics);
				}
				return new ChunkValue(numerics);
			}
			if(t.matches("^[{<(]*("+ChunkedSentence.times+")\\b.*?")){
				numerics += t+ " ";
				pointer++;
				numerics = numerics.replaceAll("[{(<>)}]", "");
				String size = numerics.trim();
				Chunk c = nextChunk();
				while(c.toString().contains("character")){
					numerics +=c.toString().replaceAll("(\\w+\\[|\\])", "")+" ";
					c = nextChunk();
				}
				numerics +=c.toString();
				if(c instanceof ChunkTHAN){
					return new ChunkTHAN(numerics.replaceFirst(size, "size["+size+"]"));
				}else{
				//if(c instanceof ChunkTHANC){
				//	return new ChunkValue(numerics);//1.5-2 times n[size[{longer} than {wide}]]
				//}else{	
					return new ChunkComparativeValue(numerics);//1-2 times a[shape[divided]]???; 1-2 times shape[{shape~list~pinnately~lobed~or~dissected}];many 2-4[-6+] times a[size[widths]];[0.5-]1.5-4.5 times u[o[(leaves)]];0.4-0.5 times u[o[(diams)]]
				}
			}
			/*if(found && this.chunkedtokens.get(i).matches("^("+this.per+")\\b.*?")){
				numerics += this.chunkedtokens.get(i)+ " ";
				pointer = i+1;
				return new ChunkBasedCount(numerics.replaceAll("[<>]", "").trim());
			}*/
			return new ChunkCount(numerics.replaceAll("[{()}]", "").trim());
		}
		//l/w: length/width
		if(t.matches("l\\s*\\W\\s*w")){
			while(!t.matches(".*?\\d.*")){
				t = this.chunkedtokens.get(++this.pointer)+" ";
			}
			this.pointer++;
			String next = this.chunkedtokens.get(this.pointer);//read next token
			if(next.matches("^[{<(]*("+ChunkedSentence.percentage+").*")){
				t += next.replaceAll("[{<()>}]", "")+ " ";
				pointer++;
			}
			return new ChunkRatio(NumericalHandler.originalNumForm(t).trim(), "length/width");
		}
		//h/w:height/width
		if(t.matches("h\\s*\\W\\s*w")){
			while(!t.matches(".*?\\d.*")){
				t = this.chunkedtokens.get(++this.pointer)+" ";
			}
			this.pointer++;
			String next = this.chunkedtokens.get(this.pointer);//read next token
			if(next.matches("^[{<(]*("+ChunkedSentence.percentage+").*")){
				t += next.replaceAll("[{<()>}]", "")+ " ";
				pointer++;
			}
			return new ChunkRatio(NumericalHandler.originalNumForm(t).trim(), "height/width");
		}
		return null;
	}
	
	
	/**
	 * needed for cases like "10 cm . long/broad/wide/thick", skip ". "
	 * @param pointer2
	 */
	private void adjustPointer4Dot(int pointer) {
		//boolean iscase = false;
		while(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().length()==0){
			pointer++;
		}
		if(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().matches("\\.")){//optional
			this.pointer++;
		}
		
		/*while(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().length()==0){
			pointer++;
		}
		while(this.chunkedtokens.size()>pointer && this.chunkedtokens.get(pointer).trim().matches("[{(<]?(long|broad|wide|thick)[})>]?")){//required
			pointer++;
			iscase = true;
		}
		if(iscase){
			this.pointer = pointer;
		}*/
	}
	/**
	 * 
	 * @return e.g. z[m[leaf] e[blade]], apex, 
	 * margins and apexes
	 * {} <> <>
	 * {} ()
	 */
	public Chunk getNextOrgan() {
		String organ = "";
		boolean found = false;
		int i = 0;
		for(i = pointer; i<this.chunkedtokens.size(); i++){
			String token = this.chunkedtokens.get(i);
			if(token.startsWith("z[")){
				pointer++;
				return new ChunkOrgan(token);
			}
			if(token.startsWith("l[")){
				pointer++;
				return new ChunkNPList(token);
			}
			if(token.startsWith("u[")){
				pointer++;
				return new ChunkNonSubjectOrgan(token);
			}
			
			if(token.matches(".*?\\b("+ChunkedSentence.prepositions+")\\b.*") || token.matches(".*?[,;:\\.].*")){
				break;
			}
			if(found && token.matches("\\b(and|or)\\b")){
				found = false;
			}
			if(found && !token.matches(".*?[>)]\\]*$")){
				pointer = i;
				if(organ.matches("^\\w+\\[")){
					organ = organ.replaceAll("(\\w+\\[|\\])", "");
				}
				organ = organ.replaceAll("[<(]", "(").replaceAll("[>)]", ")").trim();
				return new ChunkOrgan("z["+organ+"]");
			}
			organ += token+" ";
			if(token.matches(".*?[>)]\\]*$")){
				found = true;
			}
			
		}
		if(found){
			pointer = i;
			if(organ.matches("^\\w+\\[")){
				organ = organ.replaceAll("(\\w+\\[|\\])", "");
			}
			organ = organ.replaceAll("[<(]", "(").replaceAll("[>)]", ")").trim();
			return new ChunkOrgan("z["+organ+"]");
		}
		
		return null;
	}
	
	/**
	 * use the un-collapsedTree (this.tree) to check the type of a chunk with the id, 
	 * @param i
	 * @return: 

SBAR: s
VP: b[v/o]
PP: r[p/o]
VP-PP: t[c/r[p/o]]
ADJ-PP:t[c/r[p/o]]
Than: n
To: w
NPList: l
PPList: i
main subject: z[m/e]
non-subject organ/structure u[m[] relief[] o[]]
character modifier: a[m[largely] relief[smooth] m[abaxially]]

	 */
	private String chunkType(int id) {
		String token = this.chunkedtokens.get(id);
		if(token.matches("^\\w{2,}\\[.*")){
			return "ChunkSL"; //state list
		}
		/*if(token.startsWith("q[")){
			return "ChunkQP";
		}*/
		/*if(token.startsWith("s[")){
			return "ChunkSBAR";
		}*/
		if(token.startsWith("b[")){//z[{longitudinal} (ridge)] b[v[{running}] o[the {length}]] r[p[of] o[the ({quadrate})]] laterally . 
			if(token.matches(".*\\)\\]+")){
				return "ChunkVP";
			}else if(token.indexOf(" o[")<0){//turn it into a simple character chunk, for example "*meet* posteriorly"
				token = token.replaceAll("([bv]\\[|\\]|\\{|\\})", "").trim();
				Utilities.insert2TermCategoryTable(token, "feature", conn, this.tableprefix);
				token = "a[feature["+token+"]]";
				this.chunkedtokens.set(id, token);				
				return "ChunkSimpleCharacterState";
			}else{//z[{longitudinal} (ridge)] b[v[{running}] o[the {length}]] r[p[of] o[the ({quadrate})]] laterally . 
			
				String nexttoken = "";
				int i = id+1;
				while(nexttoken.length()==0 && i < this.chunkedtokens.size()){
					nexttoken = this.chunkedtokens.get(i++);
				}
				if(nexttoken.matches("r\\[p.*?o\\[.*?\\)\\]+")){//merge
					token = token.replaceFirst("\\] o\\[", " ").replaceFirst("\\]+", "").replaceAll("\\s+", " ");
					nexttoken = nexttoken.replaceFirst("r\\[p\\[", "");
					this.chunkedtokens.set(id, token+" "+nexttoken);
					this.chunkedtokens.set(i-1, "");
					return "ChunkVP";
				}
				return "ChunkVP"; //return positively anyway
			}
		}
		//if(token.startsWith("r[") && token.indexOf("[of]") >= 0){
		//	return "ChunkOf";
		//}
		if(token.startsWith("r[")){
			if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?.*("+ChunkedSentence.degree+")[}>)]?\\]+")){
				token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "").replaceAll("[<{()}>]", "");
				this.chunkedtokens.set(id, token);
				return "ChunkValueDegree";
			}else if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?.*("+ChunkedSentence.percentage+")[}>)]?\\]+")){
				token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "").replaceAll("[<{()}>]", "");
				this.chunkedtokens.set(id, token);
				return "ChunkValuePercentage";
			}else
			//r[p[around] o[10 mm]] should be ChunkValue
			if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?.*("+ChunkedSentence.units+")\\]+")){
				token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "");
				this.chunkedtokens.set(id, token);
				return "ChunkValue";
			}else if(token.matches(".* o\\[\\(?[0-9+�x���/�*/%-]+\\)?\\]+") && !token.matches(".*[�x]\\].*")){//r[p[at] o[30�]] is not a value
				token = token.replaceFirst("\\[p\\[", "[m[").replaceAll("[or]\\[", "").replaceFirst("\\]+$", "");
				this.chunkedtokens.set(id, token);
				return "ChunkValue";
			}else if(token.indexOf("o[")>=0 /*|| token.indexOf("c[")>=0*/){
				//r[p[without] o[or r[p[with] o[{poorly} {developed} {glutinous} ({ridge})]]]] ; 
				token = token.replaceAll("r\\[p\\[of\\]\\]", "of");
				this.chunkedtokens.set(id, token);
				//r[p[for] o[{dorsal} 12 , {form}]] SG.SG
				if(token.matches(".*? o\\[.*?, \\{\\w+\\}\\]+") && id >= this.chunkedtokens.size()-2){
					token = token.replaceFirst(", \\{\\w+\\}(?=\\]{1,3})","");
					this.chunkedtokens.set(id, token);
				}
				//nested preps
				if(token.matches(".*?\\[p\\[\\w+\\] o\\[\\w+ r\\[p\\[.*")){
					Pattern p = Pattern.compile("(.*?\\[p\\[\\w+)(\\] o\\[)(\\w+ )(r\\[p\\[)(.*)");
					Matcher m = p.matcher(token);
					if(m.matches()){
						token = m.group(1)+" "+m.group(3)+m.group(5).replaceFirst("\\]\\]\\s*$", "");
						this.chunkedtokens.set(id, token);
					}					
				}
				return "ChunkPrep";
			}else if(token.indexOf("-as")>0 && !token.startsWith("n[")){//as-wide-as, same-width-as:r[p[{same-width-distally-as}]]
				//a[intensity_level_or_thickness[thin]]
				//repack as ChunkSimpleCharacterState
				token = token.substring(token.lastIndexOf("[")+1, token.indexOf("]")).replaceAll("[{}]", ""); //same-width-distally-as
				String charword = token.replaceFirst(".*?-", "").replaceFirst("-.*", "");
				String chara = Utilities.lookupCharacter(charword, this.conn, ChunkedSentence.characterhash, glosstable, tableprefix);
				if(chara==null) return null;
				else{
					token = token.replace("-", " ");
					String nexttoken = this.chunkedtokens.get(id+1);
					if(nexttoken.indexOf("[")<0){
						token = "a["+chara+"["+token+" "+nexttoken+"]]";
						this.chunkedtokens.set(id+1, "");
					}else token = "a["+chara+"["+token+"]]";
					this.chunkedtokens.set(id, token);
					return "ChunkSimpleCharacterState";
				}
				
			}else{
				return null;
			}
		}
		if(token.startsWith("t[")){
			//this was for FNAv19, but it seemed all t[ chunks were only generated by composeChunk, bypassing this step. t[ chunks generated by chunking does not seem to need this reformatting.
			//reformat c[] in t[]: c: {loosely} {arachnoid} : should be m[loosely] architecture[arachnoid]
			/*Pattern p = Pattern.compile("(.*?\\b)c\\[([^]].*?)\\](.*)");
			Matcher m = p.matcher(token);
			String reformed = "";
			if(m.matches()){
				reformed += m.group(1);
				String c = reformCharacterState(m.group(2));
				reformed += c+ m.group(3);
			}
			this.chunkedtokens.set(id, reformed);*/
			return "ChunkCHPP"; //character/state-pp
		}
		if(token.startsWith("n[")){//returns three different types of ChunkTHAN
			//n[{equal-to} or {greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]]
			//n[{greater} than or {equal-to} {depth} r[p[of] o[{adjacent} (prearticular)]]]
			//n[{as-long-as} or {greater} than {depth} r[p[of] o[{adjacent} (prearticular)]]]
			//n[{as-long-as} {depth} r[p[of] o[{adjacent} (prearticular)]]]
			String beforethan = "";
			String charword= "";
			String beforechar = "";
			String afterthan = "";
			String chara = null;
			String keyword = ""; //than, as long as, etc.
			if(token.indexOf(" or ")>0 || token.startsWith("or ")){
				//find 
				Pattern p = Pattern.compile("(\\bor\\b.*?\\b(?:than|to|as)\\b)"); //equal-to, same as, or same-as
				Matcher m = p.matcher(token);
				m.find();
				keyword = "than"; // if "than" is part of " or " conjunction, then keyword is default to "than"
				beforethan = token.substring(0, m.start()+m.group(1).length()+1).trim(); //including 'than': {equal-to} or {greater} than |  {greater} than or {equal-to}
				afterthan = token.substring(m.start()+m.group(1).length()+1).trim(); //anything follows before than
				String temp = "";
				if(beforethan.indexOf(" than ")>0)
					temp = beforethan.substring(0, beforethan.indexOf(" than ")).trim();
				if(beforethan.endsWith(" than"))
					temp = beforethan.substring(0, beforethan.length()-4).trim();	
				if(temp.length()>0){
					charword = temp.substring(temp.lastIndexOf(" ")>0? temp.lastIndexOf(" ") : temp.length()).trim(); //word before "than"
					beforechar = "";
				}
				if(!charword.matches("("+ChunkedSentence.more+")")){
					chara = Utilities.lookupCharacter(charword, this.conn, ChunkedSentence.characterhash, glosstable, tableprefix);
				}
				charword = beforethan.replaceFirst("n\\[", "").trim(); //make sure not lose 'equal to' before "greater than'
			}else{
				token = token.replaceAll("�", " degrees"); //� and % don't work well with \b in reg exp.
				token = token.replaceAll("%", " percent");
				if(token.matches(".*?as-.*?-as.*")){ //as-long-as case
					Pattern p = Pattern.compile("(\\{?as-(?:"+ChunkedSentence.asasthan+")-as\\}?)");
					Matcher m = p.matcher(token);
					m.find();
					keyword = m.group(1).replaceAll("[{}]", "").replaceAll("-", " ");
					beforethan = token.substring(0, m.start()).trim().replaceFirst("n\\[", ""); //not including 'than'
					afterthan = token.substring(m.start()+m.group(1).length()+1).trim();
					charword = keyword.replaceAll("(^as | as$)", "").trim();
					keyword = ""; //reset to "" as it is not needed in the final chunk
					beforechar = beforethan;
				}else{
					Pattern p = Pattern.compile("\\b(than)\\b");
					Matcher m = p.matcher(token);
					m.find();
					keyword = m.group(1);
					beforethan = token.substring(0, m.start()).trim(); //not including 'than'
					afterthan = token.substring(m.start()+m.group(1).length()+1).trim();
					charword = beforethan.lastIndexOf(' ')>0 ? beforethan.substring(beforethan.lastIndexOf(' ')+1) : beforethan.replaceFirst("n\\[", "");
					beforechar = beforethan.replace(charword, "").trim().replaceFirst("n\\[", "");
				}			

				if(!charword.matches("("+ChunkedSentence.more+")")){
					chara = Utilities.lookupCharacter(charword, this.conn, ChunkedSentence.characterhash, glosstable, tableprefix);
				}
				//afterthan = token.substring(token.indexOf(" than ")+6);
			}
			
			if(afterthan.indexOf(" than ")>0){//2nd than in the token
				//'more than'... 2 times {longer} than {wide}]
				String cp = afterthan;
				afterthan = afterthan.replaceFirst(" than ", " constraint[than ")+"]";
				token = token.replace(cp, afterthan);
			}
			//Case B: compared to numerical values
			if(afterthan.matches(".*?\\d.*?\\b("+ChunkedSentence.units+"|"+ChunkedSentence.percentage+"|"+ChunkedSentence.size+")\\b.*") || afterthan.matches(".*?(\\d\\.\\d|%).*")){// "n[{longer} than 3 (cm)]" => n[size[{longer} than 3 (cm)]]
				//'%\b' won't match '%'
				if(chara==null){chara="size";}
				//n[more than 4 times {maximum} {width}]=> put {width} part in constraint
				//don't add another constraint in n[2 times {longer} constraint[than {wide}]]
				if(afterthan.indexOf(" constraint[")<0 && afterthan.matches(".*?\\d.*?\\b("+ChunkedSentence.size+")\\b.*")){
					String sizechara = afterthan.replaceFirst(".*?\\d.*?\\b("+ChunkedSentence.times+"|"+ChunkedSentence.percentage+") (?=[^\\d]+\\b("+ChunkedSentence.size+")\\b)", "");
					String escaped = sizechara.replaceAll("\\{", "\\\\{").replaceAll("\\}", "\\\\}").replaceAll("\\[", "\\\\[").replaceAll("\\]", "\\\\]");
					token = token.replaceFirst(escaped+"$", "constraint["+sizechara+"]");
				}
				token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
				this.chunkedtokens.set(id, token);
				return "ChunkTHAN"; //character
			}else if(afterthan.matches(".*?.*?\\d.*?\\b("+ChunkedSentence.degree+")\\b.*") || afterthan.matches(".*?\\d\\.\\d.*")){// "n[{longer} than 3 (cm)]" => n[size[{longer} than 3 (cm)]]
				if(chara==null){chara="orientation";}
				token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
				this.chunkedtokens.set(id, token);
				return "ChunkTHAN"; //character
			}
			else if(afterthan.matches(".*?.*?\\b\\d\\b.*")){// "teeth more than 20"
				if(chara==null){chara="count";}
				token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
				this.chunkedtokens.set(id, token);
				return "ChunkTHAN";
			}//Case C: compared to organs
			else if(afterthan.indexOf("(")>=0){ //contains organ
				if(chara==null){//is a constraint, lobed n[more than...]
					token = "n["+token.replaceFirst("n\\[", "constraint[")+"]";
					this.chunkedtokens.set(id, token);
					return "ChunkTHAN";
				}else{//n[more deeply lobed than...
					token = "n["+(beforechar.length()>0? "m["+beforechar+"] ": "")+chara+"["+charword+"] constraint["+keyword+" "+afterthan+"]";
					this.chunkedtokens.set(id, token);
					return "ChunkTHAN";
				}
			}//Case A n[wider than long]: compare among characters
			else{
				token = "n["+(beforechar.length()>0? "m["+beforechar+"] ": "")+chara+"["+charword+"] constraint["+keyword+" "+afterthan+"]";
				//token = "n["+token.replaceFirst("n\\[", chara+"[")+"]";
				this.chunkedtokens.set(id, token);
				//return "ChunkTHANC"; //character
				return "ChunkTHAN";
			}
		}
		if(token.startsWith("w[")){//w[{proximal} to the (florets)] ; or w[to (midvine)]
			//reformat it to CHPP
			if(token.indexOf("w[to ")>=0){
				token = token.replaceFirst("w\\[to ", "r[p[to] o[")+"]";
				this.chunkedtokens.set(id, token);
				return "ChunkPrep";
			}else{
				token = token.replaceFirst("w\\[","t[c[").replaceFirst("(\\s+|\\b)to\\s+", "] r[p[to] o[")+"]]";
				this.chunkedtokens.set(id, token);
				return "ChunkCHPP";
			}
			
		}
		if(token.startsWith("l[")){
			return "ChunkNPList";
		}
		if(token.startsWith("i[")){
			return "ChunkPPList";
		}
		if(token.startsWith("z[")){
			return "ChunkOrgan";
		}
		if(token.startsWith("u[")){
			return "ChunkNonSubjectOrgan";
		}
		return null;
	}

	/**
	 * 
	 * @param group: {loosely} {arachnoid}
	 * @return:m[loosely] architecture[arachnoid]
	 */
	@SuppressWarnings("unused")
	private String reformCharacterState(String charstring) {
		String result = "";
		String first = "";
		String last = "";
		if(charstring.lastIndexOf(' ')>=0){
			last = charstring.substring(charstring.lastIndexOf(' ')).trim();
			first = charstring.replace(last, "").trim();
			result = "m["+first+"] ";
		}else{
			last = charstring.trim();
		}
			
		String c = Utilities.lookupCharacter(last, conn, characterhash, glosstable, tableprefix);
		if(c!=null){
			result += c+"["+last+"]";
		}else if(Utilities.isVerb(last, verbs, notverbs)){
			result += "v["+last+"]";
		}
	
		return result.trim();
	}
	
	/**
	 * when parsing fails at certain point, forward the pointer to the next comma
	 */
	public void setPointer2NextComma() {
		for(; this.pointer<this.chunkedtokens.size(); pointer++){
			if(this.chunkedtokens.get(pointer).matches("(,|\\.|;|:)")){
				break;
			}
		}
		
	}
	public String getText(){
		try{
			Statement stmt = conn.createStatement();
			ResultSet rs = stmt.executeQuery("select originalsent from "+this.tableprefix+"_sentence where source ='"+sentsrc+"'");
			if(rs.next()){
				this.text = rs.getString(1); //has to use originalsent, because it is "ditto"-fixed (in SentenceOrganStateMarker.java) and perserve capitalization for measurements markup
			}
		}catch(Exception e){
			e.printStackTrace();
		}
		return this.text;
	}
	
	public String getSubjectText(){
		return this.subjecttext;
	}

	
	/*
	private void findSubject(){
		String senttag = null;
		String sentmod = null;
		String text = null;
		String taggedtext = null;
		//boolean islifestyle = false;//make this a post-process
		try{
			Statement stmt = conn.createStatement();
			//ResultSet rs = stmt.executeQuery("select modifier, tag, originalsent from "+this.tableprefix+"_sentence where source ='"+sentsrc+"'");
			ResultSet rs = stmt.executeQuery("select modifier, tag, originalsent from "+this.tableprefix+"_sentence where source ='"+sentsrc+"'");
			if(rs.next()){
				senttag = rs.getString(2).trim();
				senttag = senttag.compareTo("general")==0? "ApplicationUtilities.getProperty("unknown.structure.name")" : senttag;
				sentmod = rs.getString(1).trim();
				this.text = rs.getString(3); //has to use originalsent, because it is "ditto"-fixed (in SentenceOrganStateMarker.java) and perserve capitalization for measurements markup
			}
			rs = stmt.executeQuery("select rmarkedsent from "+this.tableprefix+"_markedsentence where source ='"+sentsrc+"'");
			if(rs.next()){
				taggedtext = rs.getString(1).trim();
				text = taggedtext.replaceAll("[{}<>]", "").trim();
			}

		}
		catch(Exception e){
			e.printStackTrace();
		}
		
		if(senttag.compareTo("ignore")!=0){
			//sentence subject
			if(senttag.compareTo("ApplicationUtilities.getProperty("unknown.structure.name")")==0){
				this.subjecttext = "(ApplicationUtilities.getProperty("unknown.structure.name"))";
			}else if(senttag.compareTo("chromosome")==0){
				this.subjecttext = "(chromosome)";
				skipLead("chromosome".split("\\s"));
			}else if(senttag.compareTo("ditto")!=0 && senttag.length()>0){
				//find the subject segment
				String subject = "";
				String [] tokens = text.split("\\s+");
				if(senttag.indexOf("[")<0){ 
					if(senttag.matches(".*\\b(or|and|plus)\\b.*")){// a , c, and/or b
						int or = senttag.lastIndexOf(" or ");
						int and = senttag.lastIndexOf(" and ");
						int ind = or < and ? and : or;
						int plus = senttag.lastIndexOf(" plus ");
						ind = plus < ind ? ind : plus;
						String seg = senttag.substring(ind).replaceAll("oo", "(oo|ee)").trim();// and/or b
						if(seg.indexOf("(oo|ee)")>=0){
							seg =seg.replaceFirst(".$", "\\\\w+\\\\b");
						}else if(seg.length() < 5){
							seg =seg.replaceFirst("..$", "\\\\w+\\\\b");
						}else{
							seg = seg.replaceFirst("...$", "\\\\w+\\\\b");
						}
						//seg = seg.replaceFirst("(and|or) ", "(and|or|plus|,) .*?");
						seg = seg.replaceFirst("(and|or) ", "(\\\\band\\\\b|\\\\bor\\\\b|\\\\bplus\\\\b|,).*?\\\\b");
						//tag derived from complex text expression: "biennial or short_lived perennial" from "iennials or short-lived , usually monocarpic perennials ,"
						seg = seg.replaceAll("(?<=\\W)\\s+(?=\\W)", ".*?")
						.replaceAll("(?<=\\W)\\s+(?=\\w)", ".*?\\\\b")
						.replaceAll("(?<=\\w)\\s+(?=\\W)", "\\\\b.*?")
						.replaceAll("(?<=\\w)\\s+(?=\\w)", "\\\\b.*?\\\\b");
						Pattern p = Pattern.compile("(^.*?"+seg+")");
						Matcher m = p.matcher(text.replaceAll("\\s*-\\s*", "_"));
						if(m.find()){
							subject = m.group(1);
							subject = subject.replaceAll("\\s+-\\s+", "-");
							if(skipLead(subject.split("\\s+"))<0){
								this.subjecttext = null;
							}else{
								String organs = senttag.replaceAll("\\w+\\s+(?!(and |or |plus |$))", "|").replaceAll("\\s*\\|\\s*", "|").replaceAll("(^\\||\\|$)", "").replaceAll("\\|+", "|");//o1|o2
								//turn organ names in subject to singular
								String[] stokens = subject.split("\\s+");
								subject = "";
								for(int i = 0; i < stokens.length; i++){
									String singular = TermOutputerUtilities.toSingular(stokens[i]);
									if(singular.matches("("+organs+")")){
										stokens[i] = singular;
									}
									subject += stokens[i]+" ";
								}
								subject = formatSubject(subject, taggedtext);
								//subject = subject.trim().replaceAll("(?<=\\b("+organs+")\\b) ", ") ").replaceAll(" (?=\\b("+organs+")\\b)", " (").replaceFirst("(?<=\\b("+organs+")\\b)$", ")").replaceFirst("^(?=\\b("+organs+")\\b)", "(").trim();
								//subject = subject.replaceAll("(?<=\\w) ", "} ").replaceAll(" (?=\\w)", " {").replaceAll("(?<=\\w)$", "}").replaceAll("^(?=\\w)", "{").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
								this.subjecttext = subject;
							}
						}	
						
					}else{
						for(int i = 0; i<tokens.length; i++){
							if(TermOutputerUtilities.toSingular(tokens[i]).compareTo(senttag.replaceAll("_", ""))==0){
								subject = subject.replaceAll("\\s+-\\s+", "-");
								subject += tokens[i]+ " ";
								//subject = "{"+subject.trim().replaceAll("[\\[\\]{}()]", "").replaceAll(" ", "} {")+"}";
								//subject = (subject + " ("+tokens[i].replaceAll("[\\[\\]]", "").replaceAll(" ", ") (")+")").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
								//this.subjecttext = addSentmod(subject, sentmod); not used in phenoscape annotation
								this.subjecttext=formatSubject(subject.trim(), taggedtext);
								if(subject.length()>0){
									//skipLead(subject.replaceAll("[\\[\\]{}()]", "").split("\\s+"));
									int skip = skipLead(subject.split("\\s+"));
									if(skip==-1) this.subjecttext=null; //subject search failed.
									break;
								}
							}else{
								subject += tokens[i]+" ";
							}
						}
					}
				}else if(senttag.indexOf("[")>=0){// must not be of-case
					subject = ("{"+sentmod.replaceAll("[\\[\\]]", "").replaceAll(" ", "} {")+"} ("+senttag.replaceAll("[\\[\\]]", "").replaceAll(" ", ") (")+")").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
					this.subjecttext=subject;
					String mt = (sentmod+" "+senttag).replaceAll("\\[+.+?\\]+", "").replaceAll("\\s+", " ").trim();
					if(mt.length()>0)
						skipLead(mt.split("\\s+"));
				}

			}else if(senttag.compareTo("ditto")==0){
				if(sentsrc.endsWith("0")){
					this.subjecttext ="(ApplicationUtilities.getProperty("unknown.structure.name"))";//it is a starting sentence in a treatment, without an explicit subject.
				}else{
					this.subjecttext ="ditto";
					//mohan code :10/28/2011. If the subject is ditto and the first chunk is a preposition chunk make the subject empty so that it can search within the same sentence for the subject.
					int j=0;
					String text1 = "";
					for(j=0;j<this.chunkedtokens.size();j++)
					{
					text1 = "";
					text1 += this.chunkedtokens.get(j);//gets the first token to check if its a preposition
					if(text1.compareTo("")!=0)
					{
						break;
					}
					}if(text1.matches("r\\[p\\[.*\\]")){
						int i=0;
						for(i=0;i<this.chunkedtokens.size();i++)
						{
							String text2="";
							text2+=this.chunkedtokens.get(i);
							if(text2.matches("(\\<.*\\>)"))
							{
								this.subjecttext =null;
								break;
							}
						}
						
					}
					//End of mohan//
				}
			}
		}else{
			if(this.text.matches(".*?[A-Z]{2,}.*")){ //this.text must be originalsent where captalization is perserved.
				this.subjecttext = "measurements";
			}else{
				this.subjecttext = "ignore";
			}
		}
		if(this.subjecttext!=null && this.subjecttext.endsWith("}")){
			this.subjecttext = null;
			this.pointer = 0;
		}
	}*/
	
	/**
	 * manual digit 
	 * => (manual) (digit) or {manual} (digit) based on the tags used in taggedtext 
	 * @param subject
	 * @param taggedtext
	 * @return
	 */
	@SuppressWarnings("unused")
	private String formatSubject(String subject, String taggedtext) {
		String[] tokens = subject.split("\\s+");
		String formatted = "";
		for(String t: tokens){
			String tag = getTag(t, taggedtext);
			if(tag.contains("<")){
				formatted += "("+t+") ";
			}else if(tag.contains("{")){
				formatted += "{"+t+"} ";
			}else{
				formatted += t+" ";
			}
		}
		formatted = formatted.trim();
		//make sure the last word is in (), in case the word was not tagged with<> in taggedtext
		if(!formatted.endsWith(")")){
			int lasti = formatted.lastIndexOf(" ")<0 ? 0 : formatted.lastIndexOf(" ");
			String lastw = formatted.substring(lasti).replaceAll("\\W", "").trim();
			formatted = formatted.replaceAll(lastw, "("+lastw+")");
		}
		return formatted.replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ").trim();
	}
	
	/**
	 * 
	 * @param t: digit
	 * @param taggedtext: <manual> <digit>
	 * @return: <
	 */
	private String getTag(String t, String taggedtext) {
		if(taggedtext.contains("<"+t+">")) return "<";
		if(taggedtext.contains("<{"+t+"}>")) return "<";
		if(taggedtext.contains("{"+t+"}")) return "{";
		return "";
	}
	/**
	 * sent
	 * @param subject: {basal} (blade)
	 * @param sentmod basal [leaf]
	 * @return
	 */
	/*private String addSentmod(String subject, String sentmod) {
		if(sentmod.indexOf("[")>=0){
			String[] tokens = subject.split("\\s+");
			String substring = "";
			for(int i = 0; i<tokens.length; i++){
				if(!sentmod.matches(".*?\\b"+tokens[i].replaceAll("[{()}]", "")+"\\b.*")){
					substring +=tokens[i]+" ";
				}
			}
			substring = substring.trim();
			substring ="{"+sentmod.replaceAll("[\\[\\]]", "").replaceAll(" ", "} {").replaceAll("[{(]and[)}]", "and").replaceAll("[{(]or[)}]", "or").replaceAll("\\{\\}", "").replaceAll("\\s+", " ")+"} "+substring;
			return substring;
		}
		return subject;
	}*/
	/**
	 * 
	 * @param begainindex (inclusive)
	 * @param endindex (not include)
	 * @return element in the range
	 */
	public String getText(int begainindex, int endindex) {
		String text = "";
		for(int i = begainindex; i < endindex; i++){
			text += this.chunkedtokens.get(i)+" ";
		}
		return text.replaceAll("\\s+", " ").trim();
	}
	
	/**
	 * @param args
	 */
	public static void main(String[] args) {
		// TODO Auto-generated method stub

	}
	public String getTokenAt(int i) {
		//if out of bound
//		if(i<0 || i>=this.chunkedtokens.size()){
//			return null;
//		}else{
//			return this.chunkedtokens.get(i);
//		}
		
		try{
			return this.chunkedtokens.get(i);
		}catch(Exception e){
			return null;
		}
	}
	public void setClauseModifierConstraint(String modifier, String constraintId) {
		this.clauseModifierConstraint = modifier;		
		this.clauseModifierContraintId = constraintId;
	}
	public ArrayList<String> getClauseModifierConstraint() {//apply to all characters in this chunkedsentence
		if(this.clauseModifierConstraint!=null){
			ArrayList<String> mc = new ArrayList<String>();
			mc.add(this.clauseModifierConstraint);	
			if(this.clauseModifierContraintId!=null) mc.add(this.clauseModifierContraintId);
			return mc;
		}else{
			return null;
		}
	}
	

	
	
}