CharacterLearner.java example

Explorer
phenoscape-nlp-master
- parsing-gui
  - lib
    - elk-distribution-0.3.2-owlapi-library
      - examples
        org
        semanticweb
        elk
        owlapi
        examples
        QueryingUnnamedClassExpressions.java
        QueryingWithNamedClasses.java
        RetrievingInstances.java
        SavingInferredAxioms.java
  - src
    - com
      - swtdesigner
        SWTResourceManager.java
    - fna
- phenoscapeII
  - src
package fna.parsing.character;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import fna.parsing.ApplicationUtilities;
import fna.parsing.DeHyphenizerCorrected;
import fna.parsing.Learn2Parse;

/**
 * Changes:
 * Patterns: remove "stop" from start, keep "stop" in the end [when to remove after stop? consider states learned vs. displayed. learned should include only the key term, displayed needs to show constraints and modifiers.]
 *           to make sure states extracted are of the organ, not related parts.
 * Stop: add new stop words:
 * a|above|after|all|almost|along|amp|an|and|are|as|at
 * |be|because|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|
 * |each|even|from|has|had|have|here|how|if|in|into|is|it|its
 * |less|may|might|more|most|much|near|not|of|often|on|over|should|so|some|sometimes|should
 * |than|that|the|then|there|these|this|those|toward|towards
 * |was|well|were|what|when|why|with|without|would
 * 
 * |few|frequently
 * |occasionally|often|rarely|somewhat|throughout|very";
 * 
 * False to-patterns:"reduced to", "to form", "appressed to", "in contrast to", "similar to"
 *     "confined to", "equal to", "perpendicular to", "dissimilar to","lobed to", "divided to", "invisible to"
 * 	   "adherent to", "according to", "proximal to"/"distal to", "to touch", "fused to", "attached to"
 * 	   "formed by", "axillary to", "back to back"
 * 
 * In Bootstrap: allow different characters in a group. e.g lacking or yellowish to light orangish
 * 
 * numbers: convert to NUM with state number.
 * 
 * comparative: > equal to or slightly shorter than <
 * special -ly words: mealy, scaly, prickly,
 * 
 * adv to/or adv adj patterns: sparsely to much branched => record only adj branched.
 * 
 * OR patterns: or not: scabrous or not. =>record only adj scabrous.
 *              more or less
 * ca.: , to ca. x m times or just ca.
 * 
 * list pattern: , "or meeting <"=>match single seg, most cases are fine, this one is bad.
 *
 *preposition phrases: on rock
 *
 *tag sentence: should organ names only be tagged when they appear at the beginning of a sentence? e.g. , rounded or with single <groove> [ or change simple pattern?]
 *markup character: may need to merge saved stategroups and checked states. eg. > entire or with 3 broad ,
 */

public class CharacterLearner  implements Learn2Parse{
	
	// Class-wide log4j logger used by every method below.
	private static final Logger LOGGER = Logger.getLogger(CharacterLearner.class);
	// Loads the JDBC driver class named by the "database.driverPath" property
	// once, when this class is initialized.
	static {
		try {
			Class.forName(ApplicationUtilities.getProperty("database.driverPath"));
		} catch (ClassNotFoundException e) {
			// The failure is only logged and printed; class initialization continues.
			LOGGER.error("Couldn't find Class in CharacterLearner" + e);
			e.printStackTrace();
		}
	}

	// Shared JDBC connection; opened by the first constructor call that finds it null.
	static private Connection conn = null;
	// Database name passed to the most recent constructor call.
	static private String database = null;
	// Regex fragment: one run of word characters/underscores followed by a single whitespace character.
	static private String word = "(?:[\\w_]+\\s)";
	// Regex fragment: a digit followed by one or more characters that are not lowercase letters.
	static private String num = "\\d[^a-z]+"; //5 pairs. 0 . 5 - 3 mm
	// Pipe-separated stop-word alternation; spliced into the patterns below and removed from terms by normalize().
	static public String stop = "a|above|after|all|almost|along|amp|an|and|are|as|at|be|because|been|before|being|beneath|between|beyond|but|by|ca|can|could|did|do|does|doing|done|each|even|from|has|had|have|here|how|if|in|into|is|it|its|less|may|might|more|most|much|near|not|of|often|on|over|should|so|some|sometimes|should|than|that|the|then|there|these|this|those|toward|towards|was|well|were|what|when|why|with|without|would|few|frequently|occasionally|often|rarely|somewhat|throughout|very";
	// Phrases containing "to" that are not range expressions; markSentences() passes them to hide() before pattern matching.
	static private String tophrases="connate to|to ca|reduced to|to form|appressed to|in contrast to|similar to|confined to|equal to|perpendicular to|dissimilar to|lobed to|divided to|invisible to|adherent to|according to|proximal to|distal to|to touch|fused to|attached to|axillary to|back to back";
	// Phrases containing "or" that are not alternatives; hidden the same way as tophrases.
	static private String orphrases="more or less";
	// Pattern for 1-3 words followed by a bracketed expression of up to 30 characters; used only by doSynonyms().
	static private String synonyms ="(?:>|^|,|"+stop+") ("+word+"{1,3})(\\[ (.{1,30}) \\])";
	//static private String simple = "((?:(?:^|,|>|"+stop+") "+word+"))or ("+word+"{1,})";     //a or b
	// "a or b": one word after start/","/">", then "or", then lazily one or more words, up to a lookahead on end, punctuation, "<" or a stop alternative.
	static private String simple = "((?:(?:^|,|>) "+word+"))or ("+word+"{1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")";     //a or b
	//static private String simple = "((?:(?:^|,|>|"+stop+") "+word+"))or ("+word+"{1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")";     //a or b
	//static private String list = "((?:(?:^|,|>|"+stop+") "+word+")*), or ("+word+"{1,})";   //a, b, c, or e f g
	// "a, b, c, or d": zero or more single-word items, then ", or", then lazily one or more words with the same lookahead.
	static private String list = "((?:(?:^|,|>) "+word+")*), or ("+word+"{1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")";   //a, b, c, or e f g
	//static private String list = "((?:(?:^|,|>|"+stop+") "+word+")*), or ("+word+"{1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")";   //a, b, c, or e f g
	//static private String to =  "((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,})";
	// "a to b": same shape as simple, but tokens are restricted to lowercase letters and underscores.
	static private String to =  "((?:(?:^|,|>) (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")";
	//static private String to =  "((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s)))to ((?:[_a-z]+\\s){1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")";
	//static private String tolist ="((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,})";
	//static private String tolist ="((?:(?:^|,|>|"+stop+") (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,}?)"+"(?:$|,|;|:|\\.|<|"+stop+")";
	// "a, b, to c": same shape as list, with "to" as the connector and lowercase/underscore tokens.
	static private String tolist ="((?:(?:^|,|>) (?:[_a-z]+\\s))*), to ((?:[_a-z]+\\s){1,}?)"+"(?=$|,|;|:|\\.|<|"+stop+")";

	
	// State groups in the order they were first created by group().
	private ArrayList<StateGroup> stategroups = null;
	// Organ-tagged sentences keyed by sentid; filled by markSentences().
	private Hashtable<Integer, String> sentences = null;
	// Never assigned in the visible code (the constructor's assignment is commented out), so it stays null here.
	private Glossary glossary = null;
	// Lookup from a group's toString() expression to the group, used to count repeats.
	private Hashtable<String, StateGroup> groups = null; 
	// Regex alternation of learned state names; set at the end of the constructor by collectStateNames().
	private String statespatterns = "";
	// Regex alternation of organ names; set by collectOrganNames().
	private String organnames = null;
	// Prefix prepended to every table name used in the SQL of this class.
	private String tablePrefix = "";
	

	/**
	 * Runs the whole learning pipeline for one table prefix.
	 *
	 * <p>If the shared connection has not been opened yet, it is opened from the
	 * "database.url" property, the {@code <prefix>_learnedstates} table is
	 * created/emptied and {@code charsegment} is cleared for every row of
	 * {@code <prefix>_sentence}. Afterwards organ names are collected, sentences
	 * are tagged and parsed into state groups, the groups are bootstrapped, the
	 * learned states are de-hyphenized and the state-name pattern is built.
	 *
	 * @param database    database name, stored statically and handed to the helpers
	 * @param tablePrefix prefix of all tables read and written by this learner
	 */
	public CharacterLearner(String database, String tablePrefix) {
		this.tablePrefix = tablePrefix;
		CharacterLearner.database = database;
		this.groups = new Hashtable<String, StateGroup>();
		Statement stmt = null;
		try{
			// NOTE(review): the reset below runs only for the call that opens the
			// connection; later instances reuse the tables as they are.
			if(conn == null){
				String URL = ApplicationUtilities.getProperty("database.url");
				conn = DriverManager.getConnection(URL);
				stmt = conn.createStatement();
				stmt.execute("create table if not exists "+this.tablePrefix+"_learnedstates (state varchar(100) NOT NULL PRIMARY KEY, count int(4))");
				stmt.execute("delete from "+this.tablePrefix+"_learnedstates");
				stmt.execute("update "+this.tablePrefix+"_sentence set charsegment =''");
			}
		}catch(Exception e){
			LOGGER.error("Exception in  CharacterLearner constructor" + e);
			e.printStackTrace();
		}finally{
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// best-effort cleanup; the setup failure (if any) was already reported
				}
			}
		}
		//glossary is created in VolumeDehyphenizer
	//	this.glossary = new Glossary(new File(Registry.ConfigurationDirectory + "FNAGloss.txt"), true, this.database, this.tablePrefix);

		this.stategroups =  new ArrayList<StateGroup>();
		this.sentences = new Hashtable<Integer, String>();
		this.organnames = collectOrganNames();

		markSentences();//tag organ names

		parseSentences();//create StateGroups
		bootstrap();//infer characters

		DeHyphenizerCorrected dh = new DeHyphenizerCorrected(database, this.tablePrefix+"_learnedstates", "state", "count", "_", this.tablePrefix, this.glossary);
		dh.deHyphen();
		this.statespatterns = collectStateNames(); //create character patterns
	}
	/**
	 * Bootstraps the collected state groups: builds a {@code Bootstrap} from
	 * {@code stategroups}, {@code glossary} and {@code database} and runs it.
	 */
	private void bootstrap(){
		new Bootstrap(stategroups, glossary, database).go();
	}
	
	
	
	/**
	 * Writes one marked-up description per row of {@code <prefix>_fileclauselink}.
	 *
	 * <p>Each row gives a file name and the index of its last clause; the clauses
	 * from the previous row's end + 1 up to that index are assembled with
	 * {@link #getDescription(int, int)}, echoed to stdout and written to the file.
	 * Errors are logged and stop the remaining files from being written.
	 */
	@SuppressWarnings("unused")
	private void assembleDescription(){
		Statement stmt = null;
		try{
			stmt = conn.createStatement();
			// NOTE(review): there is no ORDER BY, so the start/end arithmetic relies on
			// rows coming back in increasing endindex order - confirm.
			ResultSet rs = stmt.executeQuery("select filename, endindex from "+this.tablePrefix+"_fileclauselink");
			int start = 0;
			while(rs.next()){
				String filename = rs.getString("filename");
				int end = rs.getInt("endindex");
				System.out.println("output "+filename);
				String content = getDescription(start, end);
				System.out.println(content);
				BufferedWriter out = new BufferedWriter(new FileWriter(filename));
				try{
					out.write(content);
				}finally{
					// release the file handle even when the write fails
					out.close();
				}
				start = end+1;
			}
		}catch (Exception e){
			LOGGER.error("Exception in  CharacterLearner assembleDescription" + e);
			e.printStackTrace();
		}finally{
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
	}
	/**
	 * Returns the marked-up description of one source file.
	 *
	 * <p>The file's last clause index is read from {@code <prefix>_fileclauselink};
	 * its first clause is one past the largest smaller {@code endindex} (or 0 when
	 * there is none). The file name is bound as a statement parameter, so names
	 * containing quotes are handled.
	 *
	 * @param filename file name as stored in {@code <prefix>_fileclauselink}
	 * @return a list holding the single description string, or an empty list when
	 *         the file is unknown or an error occurred (errors are logged)
	 */
	public ArrayList<String> getMarkedDescription(String filename){
		ArrayList<String> results = new ArrayList<String>();
		PreparedStatement byName = null;
		PreparedStatement previous = null;
		try{
			byName = conn.prepareStatement("select filename, endindex from "+this.tablePrefix+"_fileclauselink where filename=?");
			byName.setString(1, filename);
			ResultSet rs = byName.executeQuery();
			if(rs.next()){
				int end = rs.getInt("endindex");
				previous = conn.prepareStatement("select filename, endindex from "+this.tablePrefix+"_fileclauselink where endindex<? order by endindex desc");
				previous.setInt(1, end);
				ResultSet prev = previous.executeQuery();
				int start = 0;
				if(prev.next()){
					// first row is the closest preceding file; its end + 1 is our start
					start = prev.getInt("endindex")+1;
				}
				results.add(getDescription(start, end));
			}
		}catch (Exception e){
			LOGGER.error("Exception in  CharacterLearner getMarkedDescription" + e);
			e.printStackTrace();
		}finally{
			if(previous != null){
				try{
					previous.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
			if(byName != null){
				try{
					byName.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
		return results;
	}
	
	/**
	 * Assembles the clauses {@code start..end} (inclusive) of
	 * {@code <prefix>_clause} into one XML-like string.
	 *
	 * <p>Each clause becomes {@code <tag attributes>clause</tag>}, where the element
	 * name is {@code modifier_tag} (or just the tag when the modifier is empty or
	 * the string "null") with leading digits and non-word characters removed.
	 * Curly braces are stripped from the result and hidden phrases are restored
	 * via {@code unhide}.
	 *
	 * @param start first clause id
	 * @param end   last clause id
	 * @return the assembled description
	 * @throws SQLException on database errors, or when a clause id in the range has no row
	 */
	private String getDescription(int start, int end) throws SQLException {
		StringBuilder content = new StringBuilder("<?xml version=\"1.0\"?><description>");
		// one statement for the whole range instead of one unclosed statement per clause
		Statement stmt1 = conn.createStatement();
		try{
			for(int i = start; i <= end; i++){
				ResultSet rs1 = stmt1.executeQuery("select clause, tag, attributes, modifier from "+this.tablePrefix+"_clause where clauseid="+i);
				if(!rs1.next()){
					throw new SQLException("No clause found for clauseid "+i+" in "+this.tablePrefix+"_clause");
				}
				String sent = rs1.getString("clause");
				sent = sent==null? "" : sent.trim();
				String atts = rs1.getString("attributes");
				atts = atts==null? "" : atts.trim();
				String modifier = rs1.getString("modifier");
				modifier = modifier==null? "" : modifier.trim();
				modifier = modifier.replaceAll("\\s+", "_");
				String tag = rs1.getString("tag");
				tag = tag==null? "" : tag.trim();
				rs1.close();

				tag = tag.replaceFirst("\\b(2n|n|x)\\b", "chromosomes");
				tag = tag.replaceAll("\\s+", "_");

				String starttag;
				if(!modifier.equals("") && !modifier.equals("null")){
					starttag = modifier+"_"+tag;
				}else{
					starttag = tag.trim();
				}
				// element names: drop leading numbers, then anything that is not a word character
				starttag = starttag.replaceAll("^\\d+\\s*", "").replaceAll("\\W", "");
				String endtag = "</"+starttag+">";
				if(!atts.equals("")){
					starttag += " "+atts;
				}
				content.append("<").append(starttag).append(">").append(sent).append(endtag);
			}
		}finally{
			stmt1.close();
		}
		content.append("</description>");
		return unhide(content.toString().replaceAll("[}{]", ""));
	}
	
	/**
	 * Builds the attribute string ({@code name='value' name='value' ...}, names
	 * sorted alphabetically) for one sentence.
	 *
	 * <p>Attributes come from four sources, applied in this order:
	 * measurements with a length unit ({@code size}), remaining numbers
	 * ({@code count}, fractions containing '/' are dropped), the
	 * {@code text#groupExpression} entries of {@code charsegment} (attribute name
	 * is the group's most frequent category), and learned state names found in
	 * what is left of the sentence (attribute name is the glossary character(s)
	 * of the state joined with {@code _or_}). Repeated values for one attribute
	 * are joined with ';'.
	 *
	 * @param sent        the sentence text
	 * @param charsegment ';'-separated {@code text#groupExpression} entries, may be null or empty
	 * @return the attribute string, empty when nothing was found
	 */
	private String generateAttributes(String sent, String charsegment){
		String attributes = "";
		//TODO no duplicated atts are allowed in an xml tag
		//TODO deal with comparisons between two organs.
		Hashtable<String, String> atts = new Hashtable<String, String>(); //collect attributes then sort them alphabetically
		//deal with numbers:size
		Pattern p = Pattern.compile("(.*?) ("+num+")(cm|mm|m|dm|meters|meter)\\b(.*)");
		Matcher m = p.matcher(sent);
		while(m.find()){
			String value = m.group(2).trim()+ " "+m.group(3);
			if(atts.get("size") == null){
				atts.put("size", value);
			}else{
				atts.put("size", atts.get("size")+";"+value);
			}
			// cut the measurement out and rescan the shortened sentence
			sent = m.group(1)+m.group(4);
			m = p.matcher(sent);
		}
		//deal with numbers:count
		p = Pattern.compile("(.*?) ("+num+")(.*)");
		m = p.matcher(sent);
		while(m.find()){
			String value = m.group(2).replaceAll("\\W+$", "");
			if (value.indexOf('/')<0){
				if(atts.get("count") == null){
					atts.put("count", value);
				}else{
					atts.put("count", atts.get("count")+";"+value);
				}
			}
			sent = m.group(1)+m.group(3);
			m = p.matcher(sent);
		}

		if(charsegment != null && charsegment.compareTo("")!=0){
			String[] segs = charsegment.split(";");
			for(int i = 0; i < segs.length; i++){
				String[] parts = segs[i].split("#");
				if(parts.length < 2){
					continue; // malformed entry without "text#expression"; nothing to look up
				}
				String text = parts[0];
				String exp = parts[1];
				sent = sent.replace(text, " ");
				StateGroup sg = groups.get(exp);
				if(sg == null){
					continue; // expression not among the groups learned in this run
				}
				String att = sg.mostFreqCategory().replaceFirst("#.*","").replaceAll("\\s+", "_");
				if(att.compareTo("") != 0){
					String value = text;//TODO to or patterns
					if(atts.get(att) == null){
						atts.put(att, value);
					}else{
						atts.put(att, atts.get(att)+";"+value);
					}
				}
			}
		}
		//TODO deal with negations

		// group(2) is the bare state; group(1) additionally carries an optional negation/frequency prefix
		Pattern pattern = Pattern.compile("((?:(?:not|rarely|barely|seldom) (?:\\w+ )?)?\\b("+statespatterns+")\\b)");
		m = pattern.matcher(sent);
		while(m.find()){
			String state = m.group(2);
			String value = m.group(1);
			ArrayList<?> chars = Glossary.getCharacter(state);
			if(chars.size() >0){
				Iterator<?> it = chars.iterator();
				String att = "";
				while(it.hasNext()){
					att += ((String)it.next()).replaceAll("\\s+", "_")+"_or_";
				}
				att = att.replaceFirst("_or_$", "");

				if(atts.get(att) == null){
					atts.put(att, value);
				}else{
					atts.put(att, atts.get(att)+";"+value);
				}
			}
		}
		//sort atts
		Set<String> keys = atts.keySet();
		String[] keyarray = keys.toArray(new String[]{});
		Arrays.sort(keyarray);
		for(int i = 0; i<keyarray.length; i++){
			String att = keyarray[i]+"='"+atts.get(keyarray[i])+"'";
			attributes += att+" ";
		}
		return attributes.trim();
	}
	/**
	 * Feeds every tagged sentence held in {@code sentences} to
	 * {@link #parseSentence(int, String)}, which extracts the or/to patterns
	 * and records them as state groups.
	 */
	private void parseSentences(){
		for(Enumeration<Integer> ids = sentences.keys(); ids.hasMoreElements();){
			Integer id = ids.nextElement();
			parseSentence(id.intValue(), sentences.get(id));
		}
	}
	
	/**
	 * Repeatedly strips simple, list, to and to-list patterns from a sentence,
	 * in that order, until a full pass leaves the text unchanged. Each matcher
	 * records the group it finds and returns the sentence without the match.
	 *
	 * @param sentid id of the sentence being parsed
	 * @param sent   organ-tagged sentence text
	 */
	private void parseSentence(int sentid, String sent){
		String before;
		do{
			before = sent;
			if(sent.indexOf("glabrous or floccose to tomentose or lanate")>=0){
				System.out.println();
			}
			sent = doToList(sentid, doTo(sentid, doList(sentid, doSimple(sentid, sent))));
		}while(!before.equals(sent));
	}
	/**
	 * Handles the first "a, b, to c" pattern in the sentence: the comma-separated
	 * items and the term after "to" are normalized and recorded as one group, and
	 * the matched text is removed.
	 *
	 * <p>The list part is split on commas <em>before</em> normalization, because
	 * {@code normalize} strips commas; splitting afterwards would merge all items
	 * into a single term. This mirrors {@code doList}.
	 *
	 * @param sentid id of the sentence the text belongs to
	 * @param sent   sentence text
	 * @return the sentence without the first match, or unchanged when nothing matched
	 */
	private String doToList(int sentid, String sent){
		Matcher m = Pattern.compile(CharacterLearner.tolist).matcher(sent);
		if(m.find()){
			String seg = m.group();
			System.out.println( "\t"+seg);
			String t1 = m.group(1);
			String t2 = normalize(m.group(2));
			ArrayList<String> alist = new ArrayList<String>();
			String[] terms = t1.split("\\s*,\\s*");
			for(int i = 0; i < terms.length; i++){
				// empty results are skipped later by group()
				alist.add(normalize(terms[i]));
			}
			alist.add(t2);
			group(alist, sentid, seg);
			System.out.println("["+t1+"] and ["+t2+"] are in the same group [tolist]");
			sent = m.replaceFirst("");
		}
		return sent;
	}
	/**
	 * Handles the first "a to b" pattern in the sentence. The term before "to" and
	 * the pieces of the text after it (split on the words "to"/"or") are
	 * normalized, recorded as one group, and the matched text is removed.
	 *
	 * @param sentid id of the sentence the text belongs to
	 * @param sent   sentence text
	 * @return the sentence without the first match, or unchanged when nothing matched
	 */
	private String doTo(int sentid, String sent){
		Matcher m = Pattern.compile(CharacterLearner.to).matcher(sent);
		if(!m.find()){
			return sent;
		}
		String seg = m.group();
		System.out.println( "\t"+seg);
		String first = normalize(m.group(1));
		String rest = normalize(m.group(2));
		ArrayList<String> members = new ArrayList<String>();
		members.add(first);
		System.out.print("["+first+"] ");
		for(String piece : rest.split("\\b(to|or)\\b")){
			members.add(normalize(piece));
			System.out.print("["+piece+"] ");
		}
		group(members, sentid, seg);
		System.out.println(" are in the same group [to]\n");
		return m.replaceFirst("");
	}
	/**
	 * Handles the first "a, b, c, or d" pattern in the sentence. The list part is
	 * split on commas; when that yields three or more items the first one is
	 * dropped (a conservative guard against a leading term that belongs to a
	 * different character). The remaining items plus the term after "or" are
	 * normalized, recorded as one group, and the matched text is removed.
	 *
	 * @param sentid id of the sentence the text belongs to
	 * @param sent   sentence text
	 * @return the sentence without the first match, or unchanged when nothing matched
	 */
	private String doList(int sentid, String sent){
		Matcher m = Pattern.compile(CharacterLearner.list).matcher(sent);
		if(!m.find()){
			return sent;
		}
		String seg = m.group();
		System.out.println( "\t"+seg);
		String head = m.group(1);
		String tail = m.group(2);
		ArrayList<String> members = new ArrayList<String>(Arrays.asList(head.split("\\s*,\\s*")));
		if (members.size() >= 3){
			members.remove(0); //be conservative to avoid sessile, rhomic, lanceolate, or oblanceolate
		}
		members.add(tail);
		for(int i = 0; i < members.size(); i++){
			members.set(i, normalize(members.get(i)));
		}
		group(members, sentid, seg);
		System.out.println ("["+head+"] and ["+tail+"] are in the same group [list]\n");
		return m.replaceFirst("");
	}
	
	/**
	 * Handles the first "a or b" pattern in the sentence. When the normalized
	 * text after "or" is shorter than 30 characters, the term before "or" and the
	 * pieces after it (split on the words "to"/"or") are normalized and recorded
	 * as one group. The matched text is removed in either case.
	 *
	 * @param sentid id of the sentence the text belongs to
	 * @param sent   sentence text
	 * @return the sentence without the first match, or unchanged when nothing matched
	 */
	private String doSimple(int sentid, String sent){
		Matcher m = Pattern.compile(CharacterLearner.simple).matcher(sent);
		if(!m.find()){
			return sent;
		}
		String seg = m.group();
		System.out.println("\t"+seg);
		String first = normalize(m.group(1));
		String rest = normalize(m.group(2));
		if(rest.length()<30){
			ArrayList<String> members = new ArrayList<String>();
			members.add(first);
			System.out.print("["+first+"] "); 
			for(String piece : rest.split("\\b(to|or)\\b")){
				members.add(normalize(piece));
				System.out.print("["+piece+"] "); 
			}
			group(members, sentid, seg);
			System.out.println(" are in the same group [simple]\n");
		}
		return m.replaceFirst("");
	}
	
	/**
	 * Handles the first "term [ syn1 or syn2, syn3 ]" pattern in the sentence:
	 * the term and the bracketed alternatives are normalized and recorded as one
	 * group, and the bracketed expression is removed from the sentence.
	 *
	 * <p>The bracket content is split on "or"/commas before normalization, since
	 * {@code normalize} strips commas. The terms are collected in a growable
	 * list, and the bracket text is removed as a literal (it contains regex
	 * metacharacters).
	 *
	 * @param sentid id of the sentence the text belongs to
	 * @param sent   sentence text
	 * @return the sentence without the bracketed expression, or unchanged when nothing matched
	 */
	@SuppressWarnings("unused")
	private String doSynonyms(int sentid, String sent){
		Matcher m = Pattern.compile(CharacterLearner.synonyms).matcher(sent);
		if(m.find()){
			String seg = m.group();
			System.out.println("\t"+seg);
			String save = m.group(2);
			String t1 = normalize(m.group(1));
			String t2 = m.group(3);
			ArrayList<String> list = new ArrayList<String>();
			String[] terms = t2.split("\\s*(\\bor\\b|,)\\s*");
			for(int i = 0; i < terms.length; i++){
				list.add(normalize(terms[i]));
			}
			list.add(t1);
			group(list, sentid, seg);
			System.out.println("["+t1+"] and ["+t2+"] are in the same group [syn]");
			// quote the bracket expression so "[" and "]" are matched literally
			sent = sent.replaceFirst(Pattern.quote(" "+save), "");
		}
		return sent;
	}
	/**
	 * Records a set of co-occurring terms as one {@code StateGroup}.
	 *
	 * <p>Each term is trimmed; terms still containing the words "or"/"to" are
	 * split on them. Pieces that are empty or contain an organ name are skipped;
	 * every other piece is counted in the learned-states table and added to the
	 * group as a {@code State}. A group whose expression was seen before only has
	 * its counter incremented. Finally {@code matchedseg#expression} is appended
	 * (';'-separated) to the sentence's {@code charsegment} column.
	 *
	 * @param terms      normalized candidate state terms
	 * @param clauseid   sentid of the sentence the terms came from
	 * @param matchedseg the raw text that matched the pattern
	 */
	private void group(List<String> terms, int clauseid, String matchedseg){
		StateGroup g = new StateGroup();
		Pattern organ = Pattern.compile(".*?\\b("+this.organnames+")\\b.*");
		for(String raw : terms){
			String term = raw.trim();
			String[] tmp = new String[]{term};
			if(term.matches(".*?\\b(or|to)\\b.*")){
				// split on the whole words only, so "tomentose"/"orange" stay intact
				tmp = term.split("\\s*\\b(or|to)\\b\\s*");
			}
			for(int i=0; i<tmp.length; i++){
				if(tmp[i].compareTo("") != 0 && !organ.matcher(tmp[i]).matches()){
					String t = add2LearnedStates(tmp[i]);
					State s = new State(t, glossary);
					g.addState(s);
				}
			}
		}
		String exp = g.toString();
		if(exp.compareTo("") != 0){
			if(this.groups.containsKey(exp)){
				this.groups.get(exp).increment();
			}else{
				stategroups.add(g);
				this.groups.put(exp, g);
			}

			matchedseg = matchedseg.replaceAll("[><;,\\.]", "").trim();
			matchedseg = matchedseg+"#"+exp;
			//sentence
			PreparedStatement select = null;
			PreparedStatement update = null;
			try{
				select = conn.prepareStatement("select charsegment from "+this.tablePrefix+"_sentence where sentid=?");
				select.setInt(1, clauseid);
				ResultSet rs = select.executeQuery();
				if(rs.next()){
					String tmp =rs.getString("charsegment");
					if(tmp != null && tmp.compareTo("") !=0){
						matchedseg = tmp+";"+matchedseg; // seg#exp;seg#exp
					}
					// bound as a parameter: the segment is free text and may contain quotes
					update = conn.prepareStatement("update "+this.tablePrefix+"_sentence set charsegment =? where sentid =?");
					update.setString(1, matchedseg);
					update.setInt(2, clauseid);
					update.executeUpdate();
				}else{
					LOGGER.error("Exception in  CharacterLearner group: no sentence with sentid " + clauseid);
				}
			}catch (Exception e){
				LOGGER.error("Exception in  CharacterLearner group" + e);
				e.printStackTrace();
			}finally{
				if(update != null){
					try{
						update.close();
					}catch(SQLException ignored){
						// best-effort cleanup
					}
				}
				if(select != null){
					try{
						select.close();
					}catch(SQLException ignored){
						// best-effort cleanup
					}
				}
			}
		}
	}
	
	/**
	 * Counts one occurrence of a state term in {@code <prefix>_learnedstates}:
	 * increments the row's count when the term exists, otherwise inserts it with
	 * count 1. The term is bound as a statement parameter, so terms containing
	 * quotes are stored correctly. Database errors are logged, not thrown.
	 *
	 * @param term the state term
	 * @return the term, unchanged
	 */
	private String add2LearnedStates(String term){
		String table = this.tablePrefix+"_learnedstates";
		PreparedStatement select = null;
		PreparedStatement write = null;
		try{
			select = conn.prepareStatement("select state, count from "+table+" where state =?");
			select.setString(1, term);
			ResultSet rs = select.executeQuery();
			if(rs.next()){
				int count = rs.getInt("count")+1;
				write = conn.prepareStatement("update "+table+" set count =? where state =?");
				write.setInt(1, count);
				write.setString(2, term);
			}else{
				write = conn.prepareStatement("insert into "+table+" values(?, 1)");
				write.setString(1, term);
			}
			write.executeUpdate();
		}catch (Exception e){
			LOGGER.error("Exception in  CharacterLearner add2LearnedStates" + e);
			e.printStackTrace();
		}finally{
			if(write != null){
				try{
					write.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
			if(select != null){
				try{
					select.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
		return term;
	}
	
	/**
	 * Reduces a matched fragment to its bare state term(s): strips the characters
	 * {@code < > , ; .}, stop words, digit runs and lowercase "-ly" words, turns a
	 * leading {@code to_}/{@code or_} into {@code to}/{@code or}, collapses
	 * whitespace, trims and lowercases.
	 *
	 * @param sent fragment to clean, may be null
	 * @return the cleaned fragment, or null when {@code sent} is null
	 */
	private String normalize(String sent){
		if(sent == null){
			return null;
		}
		String cleaned = sent.replaceAll("[<>,;.]", "");
		cleaned = cleaned.replaceAll("\\b("+stop+")\\b", "");
		cleaned = cleaned.replaceAll("\\d+", "");
		cleaned = cleaned.replaceAll("\\b[_a-z]*?ly\\b","");
		cleaned = cleaned.replaceFirst("^to_", "to");
		cleaned = cleaned.replaceFirst("^or_", "or");
		cleaned = cleaned.replaceAll("\\s+", " ");
		cleaned = cleaned.replaceFirst("^\\s+", "");
		cleaned = cleaned.replaceFirst("\\s+$", "");
		return cleaned.trim().toLowerCase();
	}
	
	/**
	 * Builds the regex alternation of state names: every distinct non-null state
	 * in {@code <prefix>_learnedstates}, each followed by '|', followed by whatever
	 * {@code Glossary.getAllCharacters()} returns. Database errors are logged and
	 * leave only the part collected so far.
	 *
	 * @return the alternation string
	 */
	private String collectStateNames(){
		StringBuffer tags = new StringBuffer();
		Statement stmt = null;
		try{
			stmt = conn.createStatement();
			ResultSet rs = stmt.executeQuery("select distinct state from "+this.tablePrefix+"_learnedstates");
			while(rs.next()){
				String tag = rs.getString("state");
				if(tag == null){continue;}
				tags.append(tag+"|");
			}
		}catch(Exception e){
			LOGGER.error("Exception in  CharacterLearner collectStateNames" + e);
			e.printStackTrace();
		}finally{
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
		return tags.toString()+Glossary.getAllCharacters();
	}
	
	/**
	 * Builds the '|'-separated regex alternation of organ names from three
	 * sources: glossary terms of category STRUCTURE/SUBSTANCE/PLANT, the distinct
	 * sentence tags not already collected, and the words marked with pos "p" in
	 * {@code <prefix>_wordpos}.
	 *
	 * <p>Null and empty names are skipped and the trailing '|' is always removed,
	 * even after a database error, so the result never contains an empty
	 * alternative (which would match everywhere when used in a pattern).
	 *
	 * @return the alternation, possibly empty; database errors are logged
	 */
	private String collectOrganNames(){
		StringBuffer tags = new StringBuffer();
		Statement stmt = null;
		try{
			stmt = conn.createStatement();
			//ResultSet rs = stmt.executeQuery("select distinct term from fna.fnaglossary where category in ('STRUCTURE', 'CHARACTER', 'FEATURE', 'SUBSTANCE', 'PLANT')");
			ResultSet rs = stmt.executeQuery("select distinct term from "+this.tablePrefix+"_fnaglossary where category in ('STRUCTURE', 'SUBSTANCE', 'PLANT')");
			while(rs.next()){
				String tag = rs.getString("term");
				if(tag == null || tag.length() == 0){continue;}
				tags.append(tag+"|");
			}
			rs = stmt.executeQuery("select distinct tag from "+this.tablePrefix+"_sentence");
			while(rs.next()){
				String tag = rs.getString("tag");
				if(tag == null || tag.length() == 0){continue;}
				// already present, either after another name or as the very first one
				if(tags.indexOf("|"+tag+"|") >= 0 || tags.indexOf(tag+"|") == 0){continue;}
				tags.append(tag+"|");
			}
			//find pl. form
			rs = stmt.executeQuery("select word from "+this.tablePrefix+"_wordpos where pos = \"p\"");
			while(rs.next()){
				String plural = rs.getString("word");
				if(plural == null || plural.trim().length() == 0){continue;}
				tags.append(plural.trim()+"|");
			}
		}catch(Exception e){
			LOGGER.error("Exception in  CharacterLearner collectOrganNames" + e);
			e.printStackTrace();
		}finally{
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
		int last = tags.length() - 1;
		if(last >= 0 && tags.charAt(last) == '|'){
			tags.deleteCharAt(last);
		}
		return tags.toString();
	}
/**
 * Tags organ names in every sentence and stores the result in {@code sentences}.
 *
 * <p>For sentids 0 .. count-1 of {@code <prefix>_sentence} the original sentence
 * is read, parenthesized/braced/bracketed text is removed, the false "to"/"or"
 * phrases are hidden via {@code hide}, and each organ name is wrapped in
 * {@code <...>}. Rows that are missing or have a null sentence are skipped.
 * Splitting sentences into clauses is done later, after the states are learned.
 */
	private void markSentences(){
		Statement stmt = null;
		try{
			Pattern tagsp = Pattern.compile("(.*?)\\b("+this.organnames+")\\b(.*)", Pattern.CASE_INSENSITIVE);
			stmt = conn.createStatement();
			ResultSet rs = stmt.executeQuery("select count(sentid) from "+this.tablePrefix+"_sentence");
			rs.next();
			int total = rs.getInt(1);
			String[] tos = CharacterLearner.tophrases.split("\\|");
			String[] ors = CharacterLearner.orphrases.split("\\|");

			// NOTE(review): assumes sentids are contiguous starting at 0 - confirm against the table producer.
			for(int sentid = 0; sentid < total; sentid++){
				rs = stmt.executeQuery("select tag, modifier, originalsent from "+this.tablePrefix+"_sentence where sentid="+sentid+"");
				if(!rs.next()){
					continue;
				}
				String sent = rs.getString("originalsent"); //Partha 10/17/09
				if(sent == null){
					continue; // nothing to tag; previously this aborted the whole loop with an NPE
				}
				sent = sent.replaceAll("\\([^)]*\\)", ""); ///Hong 10/17/09 added 3 lines
				sent = sent.replaceAll("\\{[^}]*\\}", "");
				sent = sent.replaceAll("\\[[^]]*\\]", "");
				// hide() leaves the text untouched when no phrase occurs, so no pre-check is needed
				sent = hide(tos, sent);
				sent = hide(ors, sent);

				StringBuilder taggedsent = new StringBuilder();
				Matcher m = tagsp.matcher(sent);
				// stop on an empty organ match: it would not consume any text and loop forever
				while(m.matches() && m.group(2).length() > 0){
					taggedsent.append(m.group(1)).append("<").append(m.group(2)).append(">");
					sent = m.group(3);
					m = tagsp.matcher(sent);
				}
				taggedsent.append(sent);
				sentences.put(Integer.valueOf(sentid), taggedsent.toString());
			}
		}catch (Exception e){
			LOGGER.error("Exception in  CharacterLearner markSentences" + e);
			e.printStackTrace();
		}finally{
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
	}
	
	/**
	 * Hides fixed phrases from the pattern matchers by replacing the whitespace
	 * inside every occurrence with '*' (e.g. "similar to" becomes "similar*to").
	 *
	 * <p>Phrases are matched literally and only on word boundaries, so "to ca"
	 * hides "to ca." but leaves "to campanulate" available to the to-pattern.
	 *
	 * @param phrases phrases to hide
	 * @param str     text to process
	 * @return the text with all phrase occurrences hidden
	 */
	private String hide(String[] phrases, String str){
		for(int i = 0; i < phrases.length; i++){
			String hidden = phrases[i].replaceAll("\\s+", "*");
			str = str.replaceAll("\\b"+Pattern.quote(phrases[i])+"\\b", Matcher.quoteReplacement(hidden));
		}
		return str;
	}

	/**
	 * Reverses {@code hide}: turns every '*' back into a space and trims the result.
	 *
	 * @param str text containing hidden phrases
	 * @return the text with '*' replaced by spaces, trimmed
	 */
	private String unhide(String str){
		return str.replace('*', ' ').trim();
	}
	
	/**
	 * Creates (if missing) and empties the {@code <prefix>_clause} and
	 * {@code <prefix>_fileclauselink} tables. Database errors are logged, not thrown.
	 */
	private void createClauseTables(){
		Statement stmt = null;
		try{
			stmt = conn.createStatement();
			stmt.execute("create table if not exists "+this.tablePrefix+"_clause (clauseid int(11) not null primary key, tag varchar(150), modifier varchar(150), clause varchar(500), charsegment varchar(500), attributes varchar(500))");
			stmt.execute("delete from "+this.tablePrefix+"_clause");
			stmt.execute("create table if not exists "+this.tablePrefix+"_fileclauselink (filename varchar(200) not null primary key, endindex int(11))");
			stmt.execute("delete from "+this.tablePrefix+"_fileclauselink");
		}catch(Exception e){
			LOGGER.error("Exception in  CharacterLearner createClauseTables" + e);
			e.printStackTrace();
		}finally{
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// best-effort cleanup
				}
			}
		}
	}
	/**
	 * Wraps every learned state name in curly braces, matching case-insensitively
	 * against {@code statespatterns}; two tagged states separated by a single
	 * space are merged into one tag.
	 *
	 * @param sent sentence with organ names already tagged, e.g. {@code <leaves> basal.}
	 * @return the sentence with states tagged
	 */
	private String tagStates(String sent){
		Pattern statePattern = Pattern.compile("(.*?)\\b("+this.statespatterns+")\\b(.*)", Pattern.CASE_INSENSITIVE);
		StringBuilder tagged = new StringBuilder();
		String remaining = sent;
		Matcher m = statePattern.matcher(remaining);
		while(m.matches()){
			tagged.append(m.group(1)).append("{").append(m.group(2)).append("}");
			remaining = m.group(3);
			m = statePattern.matcher(remaining);
		}
		tagged.append(remaining);
		return tagged.toString().replaceAll("\\} \\{", " ");
	}
	
	/**
	 * Reads tag, modifier and charsegment of one sentence from the "_sentence" table
	 * (named with this.tablePrefix).
	 * Failures are logged and printed; they are not propagated to the caller.
	 *
	 * @param sentid id of the sentence to look up
	 * @return a 3-element array: [0] tag, [1] modifier, [2] charsegment; all elements are null
	 *         when no row exists for sentid or the query fails
	 */
	private String[] getInfo(int sentid){
		String[] info = new String[3];
		Statement stmt = null;
		try{
			stmt = conn.createStatement();
			ResultSet rs = stmt.executeQuery("select tag, modifier, charsegment from "+this.tablePrefix+"_sentence where sentid="+sentid+"");
			if(rs.next()){
				info[0] = rs.getString("tag");
				info[1] = rs.getString("modifier");
				info[2] = rs.getString("charsegment");
			}else{
				// report the missing row explicitly instead of relying on an exception from getString
				LOGGER.error("CharacterLearner getInfo: no sentence found for sentid=" + sentid);
			}
		}catch (Exception e){
			LOGGER.error("Exception in  CharacterLearner getInfo" + e);
			e.printStackTrace();
		}finally{
			// closing the statement also closes the result set it produced
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// a failure while closing leaves nothing further to clean up
				}
			}
		}
		return info;
	}
	/**
	 * Splits a charsegment string between the current clause and the remaining clauses.
	 * charsegment is a ";"-separated list of segments; the part of a segment before its first
	 * "#" is the key that is looked up in the clause text. Leading segments are assigned to the
	 * clause for as long as their key occurs in taggedclause; the first segment whose key does
	 * not occur, and everything after it, forms the rest.
	 * A segment without any key text (e.g. "#") has an empty key, which matches any clause.
	 *
	 * @param taggedclause the clause text to match segment keys against
	 * @param charsegment the ";"-separated segments, may be null or blank
	 * @return [0] the segment of charsegment matching this taggedclause, [1] the rest;
	 *         both are "" when charsegment is null or blank
	 */
	private String[] splitCharSegment(String taggedclause, String charsegment){
		String[] splits = {"",""};
		if(charsegment == null || charsegment.trim().length() == 0){
			return splits;
		}
		String[] segs = charsegment.split(";");
		StringBuilder matched = new StringBuilder();
		StringBuilder rest = new StringBuilder();
		int i = 0;
		for(; i < segs.length; i++){
			// take the key by position: String.split("#") returns an empty array for "#"
			int hash = segs[i].indexOf('#');
			String key = hash < 0 ? segs[i] : segs[i].substring(0, hash);
			if(taggedclause.indexOf(key) < 0){
				break;
			}
			matched.append(segs[i]).append(';');
		}
		for(; i < segs.length; i++){
			rest.append(segs[i]).append(';');
		}
		// drop the separator appended after the last segment of each part
		if(matched.length() > 0){
			matched.setLength(matched.length() - 1);
		}
		if(rest.length() > 0){
			rest.setLength(rest.length() - 1);
		}
		splits[0] = matched.toString();
		splits[1] = rest.toString();
		return splits;
	}
	
	/**
	 * Breaks every tagged sentence into clauses and stores them via addClause.
	 * A new clause starts at each ", " that is followed by an optional {state} and an organ
	 * tag (&lt;...&gt;); the state (braces included) becomes the modifier and the organ name
	 * becomes the tag of the clause that follows. The first clause of a sentence uses the tag
	 * and modifier read by getInfo. The sentence's charsegment is distributed over its clauses
	 * with splitCharSegment, and whatever is left goes to the last clause.
	 * Clause ids are sentid plus the number of extra clauses created so far, so they stay
	 * unique across sentences.
	 */
	private void sentences2clauses(){
		createClauseTables();
		// the boundary pattern is the same for every sentence, so compile it once
		Pattern boundary = Pattern.compile(", (\\{[^{]*?\\})? ?(<\\w*?>)");
		int total = sentences.size();
		int offset = 0;
		for(int sentid = 0; sentid < total; sentid++){
			String taggedsent = (String)sentences.get(Integer.valueOf(sentid));
			if(taggedsent == null){
				// no sentence stored under this id; skip it rather than fail inside tagStates
				LOGGER.error("CharacterLearner sentences2clauses: no sentence stored for sentid=" + sentid);
				continue;
			}
			taggedsent = tagStates(taggedsent);
			String[] info = getInfo(sentid);
			String tag = info[0];
			String modifier = info[1];
			String charsegment = info[2];
			Matcher m2 = boundary.matcher(taggedsent);
			int start = 0;
			while(m2.find()){
				int end = m2.start(); //this ends a clause
				// the clause keeps the comma that terminates it
				String taggedclause = taggedsent.substring(start, end+1);
				taggedclause = taggedclause.replaceAll("[}{]", "");
				String[] segs = splitCharSegment(taggedclause, charsegment);
				charsegment = segs[1];
				addClause(sentid, sentid+offset, tag, modifier, taggedclause, segs[0], false);
				offset++;
				start = end+1;
				// modifier and tag found at this boundary apply to the next clause
				modifier = m2.group(1)==null? "" : m2.group(1);
				tag = m2.group(2).replaceAll("[<>]", "");
			}
			String taggedclause = taggedsent.substring(start);
			taggedclause = taggedclause.replaceAll("[}{]", "");
			addClause(sentid, sentid+offset, tag, modifier, taggedclause, charsegment, true);
		}
	}
	/**
	 * Inserts one clause into the "_clause" table and, for the last clause of a sentence,
	 * links the source file to that clause in the "_fileclauselink" table (both named with
	 * this.tablePrefix). The file is looked up in "_sentinfile" by endindex = sentid; no link
	 * is written when that lookup finds nothing.
	 * All values are bound as statement parameters, so quotes or backslashes in the text
	 * cannot break or alter the SQL. Failures are logged and printed, not propagated.
	 *
	 * @param sentid id of the sentence the clause was taken from
	 * @param clauseid id under which the clause is stored
	 * @param tag tag of the clause; null is stored as the text "null"
	 * @param modifier modifier of the clause; null is stored as the text "null"
	 * @param taggedclause clause text, may still contain &lt;&gt; organ tags
	 * @param charsegment character segments of the clause; null or "null" is stored as ""
	 * @param lastclause true if this is the last clause of its sentence
	 */
	private void addClause(int sentid, int clauseid, String tag, String modifier, String taggedclause, String charsegment, boolean lastclause){
		//remove <> from taggedclause before put it in the clause table
		//remove 2nd and later sets of <> from taggedclause before put into sentences (renamed to clauses) <pollen><grains> a b c d e ...
		// NOTE(review): the placeholder steps below are kept exactly as they were so the stored
		// clause text does not change; they rewrite the first '#' and '@' found in the text.
		Pattern p = Pattern.compile("^([^>]*?)> <(.*)");
		Matcher m = p.matcher(taggedclause);
		if(m.matches()){
			taggedclause = m.group(1)+"@"+m.group(2);
		}
		String tmp = taggedclause.replaceFirst("<", "#").replaceFirst(">", "##");
		tmp = tmp.replaceAll("[<>]", "");
		tmp = tmp.replaceFirst("##", ">");
		tmp = tmp.replaceFirst("#", "<");
		tmp = tmp.replaceFirst("@", "> <");
		//sentences.put(new Integer(clauseid), tmp);
		tmp = tmp.replaceAll("[<>]", "");
		charsegment = charsegment==null || charsegment.trim().compareToIgnoreCase("null")==0 ? "" : charsegment;
		charsegment = charsegment.trim();
		java.sql.PreparedStatement insertClause = null;
		java.sql.PreparedStatement findFile = null;
		java.sql.PreparedStatement insertLink = null;
		try{
			insertClause = conn.prepareStatement("insert into "+this.tablePrefix+"_clause (clauseid, tag, modifier, clause, charsegment) values(?, ?, ?, ?, ?)");
			insertClause.setInt(1, clauseid);
			// String.valueOf keeps the previous outcome for null values (the text "null")
			insertClause.setString(2, String.valueOf(tag));
			insertClause.setString(3, String.valueOf(modifier));
			insertClause.setString(4, tmp);
			insertClause.setString(5, charsegment);
			insertClause.executeUpdate();
			if(lastclause){
				findFile = conn.prepareStatement("select filename from "+this.tablePrefix+"_sentinfile where endindex=?");
				findFile.setInt(1, sentid);
				ResultSet rs = findFile.executeQuery();
				if(rs.next()){
					String fname = rs.getString("filename");
					insertLink = conn.prepareStatement("insert into "+this.tablePrefix+"_fileclauselink values (?, ?)");
					insertLink.setString(1, String.valueOf(fname));
					insertLink.setInt(2, clauseid);
					insertLink.executeUpdate();
				}
			}
		}catch (Exception e){
			LOGGER.error("Exception in  CharacterLearner addClause" + e);
			e.printStackTrace();
		}finally{
			// closing a statement also closes the result set it produced
			Statement[] opened = {insertLink, findFile, insertClause};
			for(int i = 0; i < opened.length; i++){
				if(opened[i] == null){
					continue;
				}
				try{
					opened[i].close();
				}catch(SQLException ignored){
					// a failure while closing leaves nothing further to clean up
				}
			}
		}

	}
	
	/**
	 * Command-line entry point; currently does nothing.
	 * An earlier manual driver, preserved here for reference only, ran:
	 * <pre>
	 * CharacterLearner cl = new CharacterLearner("fnav5_corpus", "fna");
	 * cl.markupCharState();
	 * cl.assembleDescription();
	 * cl.getMarkedDescription("1.xml");
	 * </pre>
	 *
	 * @param args command-line arguments (ignored)
	 */
	public static void main(String[] args) {
		// intentionally empty
	}

}