VolumeTransformer.java example

Explorer
phenoscape-nlp-master
- parsing-gui
  - lib
    - elk-distribution-0.3.2-owlapi-library
      - examples
        org
        semanticweb
        elk
        owlapi
        examples
        QueryingUnnamedClassExpressions.java
        QueryingWithNamedClasses.java
        RetrievingInstances.java
        SavingInferredAxioms.java
  - src
    - com
      - swtdesigner
        SWTResourceManager.java
    - fna
- phenoscapeII
  - src
/**
 * $Id: VolumeTransformer.java 996 2011-10-07 01:13:47Z hong1.cui $
 */
package fna.parsing;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

//import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;

import fna.db.VolumeTransformerDbAccess;

/**
 * Transforms the extracted data into the XML format.
 * 
 * Note: before the transformation is run, the data should have passed the
 * check without errors.
 * 
 * @author chunshui
 */
@SuppressWarnings({ "unchecked", "unused","static-access" })
public class VolumeTransformer extends Thread {
	
	// Regex alternation of singular organ names; parseTextTag counts matches of these
	// (together with organnamep) to decide whether a paragraph is a description.
	// NOTE(review): entries containing '_' (e.g. "cap_sule") look like hyphenation-split forms - confirm.
	private static String organnames ="2n|achene|anther|apex|awn|ax|bark|beak|blade|bract|bracteole|branch|branchlet|broad|calyx|capsule|cap_sule|caropohore|carpophore|caudex|cluster|corolla|corona|crown|cup_|cusp|cyme|cymule|embryo|endosperm|fascicle|filament|flower|fruit|head|herb|homophyllous|hypanthium|hypanth_ium|indument|inflore|inflorescence|inflores_cence|inflo_rescence|internode|involucre|invo_lucre|in_florescence|in_ternode|leaf|limb|lobe|margin|midvein|nectary|node|ocrea|ocreola|ovary|ovule|pair|papilla|pedicel|pedicle|peduncle|perennial|perianth|petal|petiole|plant|prickle|rhizome|rhi_zome|root|rootstock|rosette|scape|seed|sepal|shoot|spikelet|spur|stamen|stem|stigma|stipule|sti_pule|structure|style|subshrub|taproot|taprooted|tap_root|tendril|tepal|testa|tooth|tree|tube|tubercle|tubercule|tuft|twig|utricle|vein|vine|wing|x";
	// Regex alternation of plural organ names; used alongside organnames in parseTextTag.
	private static String organnamep ="achenes|anthers|awns|axes|blades|bracteoles|bracts|branches|buds|bumps|calyces|capsules|clusters|crescents|crowns|cusps|cymes|cymules|ends|escences|fascicles|filaments|flowers|fruits|heads|herbs|hoods|inflores|inflorescences|internodes|involucres|leaves|lengths|limbs|lobes|margins|midribs|midveins|nectaries|nodes|ocreae|ocreolae|ovules|pairs|papillae|pedicels|pedicles|peduncles|perennials|perianths|petals|petioles|pistils|plants|prickles|pules|rescences|rhizomes|rhi_zomes|roots|rows|scapes|seeds|sepals|shoots|spikelets|stamens|staminodes|stems|stigmas|stipules|sti_pules|structures|styles|subshrubs|taproots|tap_roots|teeth|tendrils|tepals|trees|tubercles|tubercules|tubes|tufts|twigs|utricles|veins|vines|wings";
	// Regex alternation of U.S. state names/abbreviations (dots escaped); parseDistriTag uses it
	// to classify a distribution segment as "us_distribution".
	private static String usstates ="Ala\\.|Alaska|Ariz\\.|Ark\\.|Calif\\.|Colo\\.|Conn\\.|Del\\.|D\\.C\\.|Fla\\.|Ga\\.|Idaho|Ill\\.|Ind\\.|Iowa|Kans\\.|Ky\\.|La\\.|Maine|Md\\.|Mass\\.|Mich\\.|Minn\\.|Miss\\.|Mo\\.|Mont\\.|Nebr\\.|Nev\\.|N\\.H\\.|N\\.J\\.|N\\.Mex\\.|N\\.Y\\.|N\\.C\\.|N\\.Dak\\.|Ohio|Okla\\.|Oreg\\.|Pa\\.|R\\.I\\.|S\\.C\\.|S\\.Dak\\.|Tenn\\.|Tex\\.|Utah|Vt\\.|Va\\.|Wash\\.|W\\.Va\\.|Wis\\.|Wyo\\.";	
	// Regex alternation of Canadian province/territory names/abbreviations (dots escaped);
	// parseDistriTag uses it to classify a distribution segment as "ca_distribution".
	private static String caprovinces="Alta\\.|B\\.C\\.|Man\\.|N\\.B\\.|Nfld\\. and Labr|N\\.W\\.T\\.|N\\.S\\.|Nunavut|Ont\\.|P\\.E\\.I\\.|Que\\.|Sask\\.|Yukon";
	// Maps a paragraph style name to an output element/tag name; loaded from
	// style-mapping.properties in the constructor.
	private Properties styleMappings;
	// Taxon index loaded in the constructor; reset to null there when it has no numbers or no names.
	private TaxonIndexer ti;
	// Receives progress-bar visibility, progress and info notifications.
	private ProcessListener listener;
	//private Hashtable errors;
	//TODO: put the following in a conf file. same for those in volumeExtractor.java
	//private String start = "^Heading.*"; //starts a treatment
	// Regex matched against a paragraph's style; a match marks the name paragraph of a treatment.
	private String start = VolumeExtractor.getStart(); //starts a treatment
	// Regex matched against a paragraph's style; a match marks a synonym/name paragraph.
	private String names = ".*?(Syn|Name).*"; //other interesting names worth parsing
	// NOTE(review): not referenced in the visible part of this class - confirm where it is used.
	private String conservednamestatement ="(name conserved|nom. cons.)";
	private static final Logger LOGGER = Logger.getLogger(VolumeTransformer.class);
	// Database helper created in the constructor; used by parseNameTag to record taxa and authorities.
	private VolumeTransformerDbAccess vtDbA = null;	
	//private Hashtable<?, ?> ranks;

	// Table names: trimmed data prefix + "_" + the configured base name (set in the constructor).
	private String taxontable = null;
	private String authortable = null;
	private String publicationtable = null;
	// JDBC connection opened in the constructor from the "database.url" property.
	private Connection conn = null;
	// Data set prefix as passed to the constructor (untrimmed).
	private String dataPrefix;
	
	// Debug switches: 'debug' gates the System.out traces in parseNameTag and 'debugref'
	// those in furtherMarkupReference.
	// NOTE(review): 'debugkey' is not referenced in the visible part of this class.
	private boolean debug = false;
	private boolean debugref = false;
	private boolean debugkey = true;
	
	/**
	 * Creates a transformer for one data set.
	 * <p>
	 * Derives the taxon, author and publication table names from the trimmed
	 * {@code dataPrefix}, loads the taxon index and the style mapping file from
	 * the configuration directory, and drops and re-creates the three database
	 * tables. A database failure is logged and otherwise ignored (best effort),
	 * so construction still succeeds without the tables.
	 *
	 * @param listener   receives progress and info notifications
	 * @param dataPrefix prefix of the database table names
	 * @throws ParsingException if the style mapping file cannot be loaded
	 */
	public VolumeTransformer(ProcessListener listener, String dataPrefix) throws ParsingException {
		this.listener = listener;
		this.dataPrefix = dataPrefix;
		this.taxontable = dataPrefix.trim() + "_" + ApplicationUtilities.getProperty("taxontable");
		this.authortable = dataPrefix.trim() + "_" + ApplicationUtilities.getProperty("authortable");
		this.publicationtable = dataPrefix.trim() + "_" + ApplicationUtilities.getProperty("publicationtable");
		vtDbA = new VolumeTransformerDbAccess(dataPrefix);
		
		// an index without numbers or names is useless; treat it as absent
		ti = TaxonIndexer.loadUpdated(Registry.ConfigurationDirectory);
		if(ti.emptyNumbers() || ti.emptyNames()) ti = null;
		
		// load style mapping; the stream is closed whether or not loading succeeds
		styleMappings = new Properties();
		FileInputStream mappingStream = null;
		try {
			mappingStream = new FileInputStream(
					Registry.ConfigurationDirectory+System.getProperty("file.separator")+"style-mapping.properties");
			styleMappings.load(mappingStream);
		} catch (IOException e) {
			throw new ParsingException(
					"Failed to load the style mapping file!", e);
		} finally {
			if(mappingStream != null){
				try {
					mappingStream.close();
				} catch (IOException ignored) {
					// the mapping has been read (or the failure already reported); nothing to recover
				}
			}
		}
		
		// (re)create the working tables; table names cannot be bound as statement
		// parameters, so they are concatenated from the configured names
		Statement stmt = null;
		try{
			if(conn == null){
				String URL = ApplicationUtilities.getProperty("database.url");
				conn = DriverManager.getConnection(URL);
				stmt = conn.createStatement();
				stmt.execute("drop table if exists "+taxontable);
				stmt.execute("create table if not exists "+taxontable+" (taxonnumber varchar(10), name varchar(500), rank varchar(20), filenumber int)");
				stmt.execute("drop table if exists "+authortable);
				stmt.execute("create table if not exists "+ authortable+" (authority varchar(500) NOT NULL)");
				stmt.execute("drop table if exists "+publicationtable);
				stmt.execute("create table if not exists "+ publicationtable+" (publication varchar(500) NOT NULL)");				
			}
		}catch(Exception e){
			LOGGER.error("VolumeTransformer : Database error in constructor", e);
			e.printStackTrace();
		}finally{
			// release the statement; the connection itself stays open in 'conn'
			if(stmt != null){
				try{
					stmt.close();
				}catch(SQLException ignored){
					// closing failed; the DDL has already run or been reported
				}
			}
		}

	}

	
	/**
	 * Thread entry point: shows the progress bar, runs {@link #transform()} and
	 * hides the progress bar again, even when the transformation fails.
	 */
	@Override
	public void run() {
		listener.setProgressBarVisible(true);
		try {
			transform();
		} finally {
			// always restore the UI state, otherwise a failed run leaves the bar showing
			listener.setProgressBarVisible(false);
		}
	}
	/**
	 * Transforms every extracted file {@code 1.xml .. N.xml} (N = number of
	 * entries in the EXTRACTED directory) into a {@code <treatment>} document
	 * written to the TRANSFORMED directory.
	 * <p>
	 * Each paragraph is dispatched on its style: the treatment name, synonym
	 * names, running text (description / distribution / discussion), or any
	 * other style, which is copied into an element named by the style mapping.
	 * References and keys are marked up further once all paragraphs are
	 * processed, and the description and habitat text of each treatment is
	 * handed to {@code outputElementText}. Progress is reported on a 0-50 scale.
	 *
	 * @throws ParsingException if the extracted directory cannot be listed or
	 *         any file fails to parse or transform (the cause is attached)
	 */
	public void transform() throws ParsingException {
		// get the extracted files list
		File source = new File(Registry.TargetDirectory, ApplicationUtilities.getProperty("EXTRACTED"));
		File[] extracted = source.listFiles();
		if (extracted == null) {
			// listFiles() yields null when the path is not a readable directory
			throw new ParsingException(new IOException("Cannot list extracted files in: " + source.getPath()));
		}
		int total = extracted.length;
		listener.progress(1);
		try {
			for (int count = 1; count <= total; count++) {

				File file = new File(source, count + ".xml");

				SAXBuilder builder = new SAXBuilder();
				Document doc = builder.build(file);
				Element root = doc.getRootElement();

				Element treatment = new Element("treatment");
				List<Element> plist = XPath.selectNodes(root, "/treatment/paragraph");
				int textcount = 0;
				String ptexttag = "";
				
				for (Element pe : plist) {
					String style = pe.getChildText("style");
					String text = getChildText(pe, "text");
					
					if (style.matches(start)) {
						// process the name tag
						String sm = styleMappings.getProperty(style);
						parseNameTag(count - 1, sm, text, treatment);
					} else if (style.matches(names)) {
						// process the synonym name tag
						String sm = styleMappings.getProperty(style);
						parseSynTag(sm, text, treatment);
					} else if (style.indexOf("Text") >= 0) {
						// process the description, distribution, discussion tag
						if (text.trim().length() > 0) {
							textcount++;
							ptexttag = parseTextTag(textcount, text, treatment, count, ptexttag);
						}
					} else {
						// any other style is copied verbatim under its mapped element name;
						// references and keys among them are refined after this loop
						String sm = styleMappings.getProperty(style);
						if (sm == null) {
							throw new IllegalStateException("No style mapping for style '" + style + "' in " + file.getPath());
						}
						Element e = new Element(sm);
						e.setText(text);
						treatment.addContent(e);
					}
				}
			
				//further mark up reference
				List<Element> references = XPath.selectNodes(treatment, "./references");
				for (Element ref : references) {
					furtherMarkupReference(ref);
				}
				
				//further mark up keys <run_in_sidehead>
				List<Element> keyparts = XPath.selectNodes(treatment, "./key|./couplet");
				if (keyparts.size() > 0) {//contains key
					furtherMarkupKeys(treatment);
				}
				
				// output the treatment to transformed
				File xml = new File(Registry.TargetDirectory,
						ApplicationUtilities.getProperty("TRANSFORMED") + System.getProperty("file.separator") + count + ".xml");
				ParsingUtil.outputXML(treatment, xml ,null);
				
				// output the description part
				outputElementText(count, getChildText(treatment, "description"), "DESCRIPTIONS");
				
				// output the habitat part
				outputElementText(count, getChildText(treatment, "habitat"), "HABITATS");
				
				listener.progress((count*50) / total);
			}
		} catch (Exception e) {
			LOGGER.error("VolumeTransformer : transform - error in parsing", e);
			e.printStackTrace();
			throw new ParsingException(e);
		}
	}

	/**
	 * Marks up the keys of a treatment.
	 * <p>
	 * First gathers the loose key statements into {@code <TaxonKey>} containers
	 * (see {@link #assembleKeys(Element)}), then rewrites each container so that
	 * individual statements such as
	 * <pre>
	 * &lt;key&gt;2. Carpels and stamens more than 5; plants perennial; leaves alternate; inflorescences ax-&lt;/key&gt;
	 * &lt;key&gt;illary, terminal, or leaf-opposed racemes or spikes ### 3. Phytolac ca ### (in part), p. 6&lt;/key&gt;
	 * </pre>
	 * become
	 * <pre>
	 * &lt;key_statement&gt;
	 *   &lt;statement_id&gt;2&lt;/statement_id&gt;
	 *   &lt;statement&gt;Carpels and stamens more than 5; plants perennial; leaves alternate;
	 *   inflorescences ax-illary, terminal, or leaf-opposed racemes or spikes&lt;/statement&gt;
	 *   &lt;determination&gt;3. Phytolacca (in part), p. 6&lt;/determination&gt;
	 * &lt;/key_statement&gt;
	 * </pre>
	 * {@code <determination>} is optional and may be replaced by
	 * {@code <next_statement_id>}. A failure is logged and the treatment is left
	 * in whatever state had been reached.
	 *
	 * @param treatment the treatment element whose keys are marked up in place
	 */
	private void furtherMarkupKeys(Element treatment) {
		assembleKeys(treatment);
		try{
			List<Element> keys = XPath.selectNodes(treatment, "./TaxonKey");
			for(Element key: keys){
				furtherMarkupKeyStatements(key);
			}
		}catch(Exception e){
			// report through the class logger like the other handlers in this class
			LOGGER.error("VolumeTransformer : furtherMarkupKeys - error in marking up keys", e);
			e.printStackTrace();
		}
		
	}
	
	/**
	 * Replaces one {@code <TaxonKey>} container by a {@code <key>} element whose
	 * statements are structured.
	 * <p>
	 * Children named {@code key} or {@code couplet} are read from last to first
	 * so that a continuation line (text starting with a lower-case letter) can be
	 * appended to the statement line that precedes it. Each complete statement
	 * becomes
	 * <pre>
	 * &lt;key_statement&gt;
	 *   &lt;statement_id&gt;2.&lt;/statement_id&gt;
	 *   &lt;statement&gt;...&lt;/statement&gt;
	 *   &lt;determination&gt;3. Phytolacca (in part), p. 6&lt;/determination&gt;
	 * &lt;/key_statement&gt;
	 * </pre>
	 * where {@code <determination>} is present only when one was found; otherwise
	 * {@code <next_statement_id>} carries the id of the statement that follows in
	 * document order (if any). All other children are copied as they are, except
	 * that {@code run_in_sidehead} is renamed to {@code key_head}. The resulting
	 * {@code <key>} is appended to the container's parent and the container is
	 * detached.
	 *
	 * @param taxonkey the {@code <TaxonKey>} container to rewrite
	 */
	private void furtherMarkupKeyStatements(Element taxonkey) {
		final String marker = "\\s*###\\s*";
		// splits off the trailing determination ("### 3. ..." or "Group n ...")
		Pattern determinationPattern = Pattern.compile("(.*?)(( ### [\\d ]+[a-z]?\\.| ?#* ?Group +\\d).*)");
		// splits the leading statement id from the statement text
		Pattern idPattern = Pattern.compile("^([\\d ]+[a-z]?\\..*?) (.? ?[A-Z].*)");

		List<Element> children = taxonkey.getChildren();
		ArrayList<Element> ordered = new ArrayList<Element>(); // kept in document order
		String pendingDetermination = null; // determination seen but not yet attached to a statement
		String continuation = "";           // tail of a statement broken over two elements
		String followingId = null;          // id of the statement after the current one

		int index = children.size();
		while (--index >= 0) {
			Element child = children.get(index);
			String childName = child.getName();

			if (!childName.equals("key") && !childName.equals("couplet")) {
				// not a statement (e.g. "discussion"): keep a copy in place
				Element copy = (Element) child.clone();
				if (copy.getName().equals("run_in_sidehead")) {
					copy.setName("key_head");
				}
				ordered.add(0, copy);
				continue;
			}

			String body = child.getTextTrim() + continuation;
			Matcher determinationMatch = determinationPattern.matcher(body);
			if (determinationMatch.matches()) {
				body = determinationMatch.group(1).trim();
				pendingDetermination = determinationMatch.group(2).trim();
			}

			Matcher idMatch = idPattern.matcher(body);
			if (!idMatch.matches()) {
				// no leading id: remember a lower-case fragment for the preceding line
				if (body.matches("^[a-z]+.*")) {
					continuation = body;
				}
				continue;
			}

			String currentId = idMatch.group(1).trim();
			body = idMatch.group(2).trim();
			continuation = "";

			Element idElement = new Element("statement_id");
			idElement.setText(currentId.replaceAll(marker, ""));
			Element textElement = new Element("statement");
			textElement.setText(body.replaceAll(marker, ""));

			Element target = null;
			if (pendingDetermination != null) {
				target = new Element("determination");
				target.setText(pendingDetermination.replaceAll(marker, ""));
				pendingDetermination = null;
			} else if (followingId != null) {
				target = new Element("next_statement_id");
				target.setText(followingId.replaceAll(marker, ""));
			}
			followingId = currentId;

			Element statement = new Element("key_statement");
			statement.addContent(idElement);
			statement.addContent(textElement);
			if (target != null) {
				statement.addContent(target);
			}
			ordered.add(0, statement);
		}

		Element marked = new Element("key");
		for (Element item : ordered) {
			marked.addContent(item);
		}
		taxonkey.getParentElement().addContent(marked);
		taxonkey.detach();
	}


	/**
	 * Collects the key material of a treatment into {@code <TaxonKey>}
	 * containers appended to the treatment.
	 * <p>
	 * Two layouts are handled:
	 * <ol>
	 * <li>a single key made of {@code key}/{@code couplet} elements without any
	 * {@code run_in_sidehead} heading;</li>
	 * <li>several keys, each opened by a {@code run_in_sidehead} whose text starts
	 * with "Key to " or looks like "Group n" (that tag may also mark other
	 * content, which is left alone until a key has started).</li>
	 * </ol>
	 * From the first key element onwards every child is copied into the current
	 * container and removed from the treatment.
	 *
	 * @param treatment the treatment element, modified in place
	 */
	private void assembleKeys(Element treatment) {
		// iterate over a snapshot: the live child list changes as content is added
		List<Element> live = treatment.getChildren();
		Element[] snapshot = live.toArray(new Element[0]);
		ArrayList<Element> moved = new ArrayList<Element>();
		Element currentKey = null;
		boolean insideKey = false;

		for (Element child : snapshot) {
			String name = child.getName();
			if (name.equals("run_in_sidehead")) {
				String heading = child.getTextTrim();
				if (heading.startsWith("Key to ") || heading.matches("Group \\d+.*")) {
					// a heading starts a new key; flush the one collected so far
					insideKey = true;
					if (currentKey != null) {
						treatment.addContent((Element) currentKey.clone());
					}
					currentKey = new Element("TaxonKey");
				}
			} else if (!insideKey && (name.equals("key") || name.equals("couplet"))) {
				// first statement of a key that has no heading
				insideKey = true;
				if (currentKey == null) {
					currentKey = new Element("TaxonKey");
				}
			}

			if (insideKey) {
				moved.add(child);
				currentKey.addContent((Element) child.clone());
			}
		}

		if (currentKey != null) {
			treatment.addContent(currentKey);
		}
		// the originals are only removed once all copies are in place
		for (Element original : moved) {
			original.detach();
		}
	}


	/**
	 * Splits the running text of a {@code <references>} element into one
	 * {@code <reference>} child per cited work, e.g.
	 * <pre>
	 * &lt;references&gt;SELECTED REFERENCES Behnke, H.-D., ... 1974. Betalains and ... Taxon 23: 541-542. Brown, G. K. and ... 10: 49-63. ...&lt;/references&gt;
	 * </pre>
	 * becomes
	 * <pre>
	 * &lt;references&gt;&lt;reference&gt;... Taxon 23: 541-542.&lt;/reference&gt;&lt;reference&gt; Brown, G. K. and ... 10: 49-63.&lt;/reference&gt;...&lt;/references&gt;
	 * </pre>
	 * A reference is taken to end at a page range followed by a period (and an
	 * optional closing bracket) when the next text starts with a capitalised word
	 * and a comma. Whatever remains after the last split becomes the final
	 * reference. The element's own text is cleared; its children are added in
	 * place.
	 *
	 * @param ref the {@code <references>} element to split, modified in place
	 */
	private void furtherMarkupReference(Element ref) {
		String text = ref.getText();
		ref.setText("");
		if(this.debugref) System.out.println("\nReferences text:"+text);
		// NOTE(review): the character between the two \\d+ groups is the page-range dash; it
		// appears mis-encoded in this copy of the file - confirm it matches the dash in the data.
		Pattern p = Pattern.compile("(.*?\\d+�\\d+\\.\\]?)(\\s+[A-Z]\\w+,.*)");
		Matcher m = p.matcher(text);
		while(m.matches()){
			String refstring = m.group(1);
			Element refitem = new Element("reference");
			refitem.setText(refstring);
			ref.addContent(refitem);
			if(this.debugref) System.out.println("a ref:"+refstring);
			// continue with the remainder of the text
			text = m.group(2);
			m = p.matcher(text);
		}
		// the remainder is the last reference; store it as is, like all the others
		// (it used to be written with a stray "item:" prefix)
		Element refitem = new Element("reference");
		refitem.setText(text);
		ref.addContent(refitem);
		if(this.debugref) System.out.println("a ref:"+text);
	}


	/**
	 * Joins the text of all direct children of {@code pe} that have the given
	 * name, separated by single spaces, with every run of whitespace collapsed
	 * to one space and the ends trimmed.
	 *
	 * @param pe     the parent element
	 * @param string the child element name to select
	 * @return the normalized, concatenated text; empty when there is no such child
	 * @throws Exception if the XPath selection fails
	 */
	private String getChildText(Element pe, String string) throws Exception{
		List<Element> matches = XPath.selectNodes(pe, "./"+string);
		StringBuilder joined = new StringBuilder();
		for (Element match : matches) {
			joined.append(match.getText());
			joined.append(" ");
		}
		String collapsed = joined.toString().replaceAll("\\s+", " ");
		return collapsed.trim();
	}

	/**
	 * Classifies one running-text paragraph and adds it to the treatment.
	 * <p>
	 * The first text paragraph is a description when it mentions at least two
	 * organ names, otherwise a distribution. Later paragraphs depend on the tag
	 * of the previous one: after a description comes a distribution; after a
	 * distribution or a discussion comes a discussion. In any other case nothing
	 * is added.
	 *
	 * @param textcount 1-based position of this paragraph among the text paragraphs
	 * @param text      the paragraph text
	 * @param treatment the treatment element receiving the new content
	 * @param filecount number of the file being processed (not used here)
	 * @param ptag      tag returned for the previous text paragraph
	 * @return the tag chosen for this paragraph, or "" when none applied
	 */
	private String parseTextTag(int textcount, String text, Element treatment, int filecount, String ptag){
		// count organ-name mentions (singular or plural, any case)
		Pattern organpt = Pattern.compile("\\b("+organnamep+"|"+organnames+")\\b", Pattern.CASE_INSENSITIVE);
		Matcher organs = organpt.matcher(text);
		int organcount = 0;
		while(organs.find()){
			organcount++;
		}

		if(textcount == 1){
			if(organcount >= 2){
				addElement("description", text, treatment);
				return "description";
			}
			//TODO: further markup distribution to: # of infrataxa, introduced, generalized distribution, flowering time,habitat, elevation, state distribution, global distribution
			parseDistriTag(text, treatment);
			return "distribution";
		}

		if(ptag.equals("description")){
			// the paragraph right after a description is the distribution (FNA v19)
			parseDistriTag(text, treatment);
			return "distribution";
		}

		if(ptag.equals("distribution") || ptag.equals("discussion")){
			addElement("discussion", text, treatment);
			return "discussion";
		}

		return "";
	}
	/**
	 * Breaks a distribution paragraph into finer elements.
	 * <p>
	 * Text starting with a rank word (Genera, Genus, Species, Subspecies,
	 * Varieties, Subgenera) up to a colon is treated as a higher-taxon statement:
	 * number of infrataxa, an optional "introduced" flag and a general
	 * distribution. Any other text is treated as a lower-taxon statement:
	 * flowering time, habitat, conservation, elevation, "introduced", and then
	 * the ';'-separated distribution segments, each classified as U.S., Canadian
	 * or global. Text matching neither form is reported on standard error.
	 *
	 * @param text      the distribution paragraph
	 * @param treatment the treatment element receiving the new content
	 */
	private void parseDistriTag(String text, Element treatment){
		Matcher rankMatch = Pattern.compile("^((?:Genera|Genus|Species|Subspecies|Varieties|Subgenera).*?:)\\s*(introduced\\s*;)?(.*)").matcher(text);
		if(rankMatch.matches()){
			//species and higher
			String infrataxa = rankMatch.group(1);
			String introduced = rankMatch.group(2);
			String general = rankMatch.group(3);
			if(infrataxa != null){
				addElement("number_of_infrataxa", infrataxa, treatment);
			}
			if(introduced != null){
				addElement("introduced", introduced, treatment);
			}
			if(general != null){
				DistributionParser4FNA parser = new DistributionParser4FNA(treatment, general, "general_distribution");
				treatment = parser.parse();
			}
			return;
		}

		//species and lower
		Matcher detail = Pattern.compile("(Flowering.*?\\.)?(.*?(?:;|\\.$))?(\\s*of conservation concern\\s*(?:;|\\.$))?(.*?\\b(?:\\d+|m)\\b.*?(?:;|\\.$))?\\s*(introduced(?:;|\\.$))?(.*)").matcher(text);
		if(!detail.matches()){
			System.err.println("distribution not match: "+text);
			return;
		}

		String flowering = detail.group(1);
		if(flowering != null){
			FloweringTimeParser4FNA parser = new FloweringTimeParser4FNA(treatment, flowering, "flowering_time");
			treatment = parser.parse();
		}

		// the simple parts are stored as plain elements, in this fixed order
		String[] simpleTags = {"habitat", "conservation", "elevation", "introduced"};
		for(int g = 0; g < simpleTags.length; g++){
			String part = detail.group(g + 2);
			if(part != null){
				addElement(simpleTags[g], part, treatment);
			}
		}

		String areas = detail.group(6);
		if(areas == null){
			return;
		}
		String usPattern = ".*?\\b("+usstates+")(\\W|$).*";
		String caPattern = ".*?\\b("+caprovinces+")(\\W|$).*";
		for(String area : areas.split(";")){
			String areaTag;
			if(area.matches(usPattern)){
				areaTag = "us_distribution";
			}else if(area.matches(caPattern)){
				areaTag = "ca_distribution";
			}else{
				areaTag = "global_distribution";
			}
			// each parser works on, and hands back, the treatment for the next segment
			DistributionParser4FNA parser = new DistributionParser4FNA(treatment, area, areaTag);
			treatment = parser.parse();
		}
	}
	
	/**
	 * Adds a synonym paragraph to the treatment, tagged after the name element
	 * already present in the treatment. The name elements are checked in the
	 * order variety, subspecies, species, tribe, genus and the first one found
	 * decides the tag ({@code synonym_of_<rank>_name}); when none is present the
	 * given {@code tag} is used unchanged.
	 *
	 * @param tag       fallback tag taken from the style mapping
	 * @param text      the synonym text
	 * @param treatment the treatment element receiving the new content
	 */
	private void parseSynTag(String tag, String text, Element treatment){
		String[] nameTags = {"variety_name", "subspecies_name", "species_name", "tribe_name", "genus_name"};
		String synonymTag = tag;
		for(String nameTag : nameTags){
			if(treatment.getChild(nameTag) != null){
				synonymTag = "synonym_of_" + nameTag;
				break;
			}
		}
		addElement(synonymTag, text, treatment);
	}
	
	private String parseNameTag(int index, String namerank, String line,
			Element treatment) {
		if(line == null || line.equals("")){
			return ""; //TODO: should not happen. but did happen with v. 19 295.xml==>VolumeExtractor JDOM problem.
		}
		
		String name = ti.getName(index);
		if(name==null ||name.compareTo("") == 0){
			File xml = new File(Registry.TargetDirectory,
					ApplicationUtilities.getProperty("TRANSFORMED") + System.getProperty("file.separator") + (index+1) + ".xml");
			listener.info("no name found in: ", xml.getPath());
			//errors.put((index+1)+"","no name found in: "+line);
			return "";
		}
		// make a copy of the line and will work on the new copy
		String text = new String(line);
		text = text.replaceAll("�", " ").replaceAll("\\s+", " ").trim(); //there are some whitespaces that are not really a space, don't know what they are. 
		if(debug) System.out.println("\n"+(index+1)+": text="+text);
		
		String number = null;
		if(ti != null)
			number = ti.getNumber(index);
		else{
			number = line.substring(0, line.indexOf('.'));
		}
		// number
		addElement("number", number, treatment); // TODO: add the number tag
		                                         // to the sytle mapping

		//text = text.substring(number.length() + 1); //Hong 08/04/09 change to
		text = VolumeVerifier.fixBrokenNames(text);
		text = text.replaceFirst("^.*?(?=[A-Z])", "").trim();;
		
		//namerank and name
		//(subfam|var|subgen|subg|subsp|ser|tribe|subsect)
		if(namerank.indexOf("species_subspecies_variety_name")>=0){
			if(text.indexOf("var.") >=0){
				namerank = "variety_name";
			}else if(text.indexOf("subsp.") >=0){
				namerank = "subspecies_name";
			}else if(text.indexOf("ser.") >=0){
				namerank = "series_name";
			}else if(text.indexOf("sect.") >=0){
				namerank = "section_name";
			}else if(text.indexOf("subsect.") >=0){
				namerank = "subsection_name";
			}else {
				namerank = "species_name";
			}
		}
		if(debug) System.out.println("namerank:"+namerank);
		String[] nameinfo = getNameAuthority(name);
		if(nameinfo[0]!=null && nameinfo[1]!=null){
		addElement(namerank, nameinfo[0], treatment);
		try {
			vtDbA.add2TaxonTable(number, name, namerank, index+1);
		} catch (ParsingException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
			LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
		} catch (SQLException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
			LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
		}
		if(debug) System.out.println("name:"+nameinfo[0]);
		if(nameinfo[1].length()>0){
			addElement("authority", nameinfo[1], treatment);
			try {
				vtDbA.add2AuthorTable(nameinfo[1]);
			} catch (ParsingException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
				LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
			} catch (SQLException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
				LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
			}
			if(debug) System.out.println("authority:"+nameinfo[1]);
		}
		text = text.replaceFirst("^\\s*.{"+name.length()+"}","").trim();
		}
		//authority
		/*Pattern p = Pattern.compile("(.*?)((?: in|,|·|\\?).*)");
		Matcher m = p.matcher(text);
		if(m.matches()){
			if(m.group(1).trim().compareTo("")!= 0){
				addElement("authority", m.group(1).trim(), treatment);
				try {
					vtDbA.add2AuthorTable(m.group(1).trim());
				} catch (ParsingException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
					LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
				} catch (SQLException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
					LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
				}
				//System.out.println("authority:"+m.group(1).trim());
			}
			text = m.group(2).trim();
		}*/
		//save the segment after ?or ?for later
		/*String ending = "";
		int pos = text.lastIndexOf('.');
		if(pos < 0){
			pos = text.lastIndexOf('?');
		}
		if (pos != -1) {
			ending = text.substring(pos + 1).trim();
			text = text.substring(0, pos+1);
		}*/
		
		//derivation: deal with this first to remove [] and avoid pub-year match in []
		Pattern p = Pattern.compile("(.*?)(\\[.*?\\]$)");
		Matcher m = p.matcher(text);
		if(m.matches()){
			if(m.group(2).trim().compareTo("")!= 0){
				addElement("etymology", m.group(2).trim(), treatment);
				if(debug) System.out.println("etymology:"+m.group(2).trim());
			}
			text = m.group(1).trim();
		}
		
		//place of publication 
		//Pattern p = Pattern.compile("(.* [12]\\d\\d\\d|.*(?=·)|.*(?=.))(.*)"); //TODO: a better fix is needed Brittonia 28: 427, fig. 1.  1977   ?  Yellow spinecape [For George Jones Goodman, 1904-1999
		p = Pattern.compile("(.* [12]\\d\\d\\d)($|,|\\.| +)(.*)"); //TODO: a better fix is needed Brittonia 28: 427, fig. 1.  1977   ?  Yellow spinecape [For George Jones Goodman, 1904-1999
		m = p.matcher(text);
		if(m.matches()){
			String pp = m.group(1).replaceFirst("^\\s*[,\\.]", "").trim();			
			extractPublicationPlace(treatment, pp); //pp may be "Sp. Pl. 1: 480.  1753; Gen. Pl. ed. 5, 215.  1754"
			text = m.group(3).trim();
		}

		// conserved
		String conserved="name conserved";
		int	pos = text.indexOf(conserved);
		if(pos < 0){
			conserved="name proposed for conservation";
			pos = text.indexOf(conserved);
		}
		if(pos < 0){
			conserved="nom. cons.";
			pos = text.indexOf(conserved);
		}
		if (pos != -1) {
			//String conserved = text.substring(pos).trim();
			text = text.replace(conserved, "").trim();
			//conserved = conserved.replaceFirst("^\\s*[,;\\.]", "");
			addElement("conserved", conserved, treatment);
			if(debug) System.out.println("conserved:"+conserved);
			
			// trim the text
			//int p1 = text.lastIndexOf(',', pos);
			//text = text.substring(0, p1);
		}

		//past_name
		p = Pattern.compile("\\((?:as )?(.*?)\\)(.*)");
		m = p.matcher(text);
		if(m.matches()){
			if(m.group(1).trim().compareTo("")!= 0){
				addElement("past_name", m.group(1).trim(), treatment);
				if(debug) System.out.println("past_name:"+m.group(1).trim());
			}
			text = m.group(2).trim();
		}

		//common name
		p = Pattern.compile("(.*?)[��](.*?)(\\[.*|$)");
		m = p.matcher(text);
		if(m.matches()){
			if(m.group(2).trim().compareTo("")!= 0){
				String[] commonnames = m.group(2).trim().split("\\s*,\\s*");
				for(String cname: commonnames){
					addElement("common_name", cname, treatment);
					if(debug) System.out.println("common_name:"+cname);
				}
			}
			text = (m.group(1)+" "+m.group(3)).trim();
		}

		// format mark, common name, derivation
		/*{
			//int pos = text.lastIndexOf('?);
			//if(pos < 0){
			//	pos = text.lastIndexOf('?);
			//}
			if (ending.compareTo("") != 0) {
				//String ending = text.substring(pos + 1).trim();
				String[] results = ending.split("\\[");

				String commonName = results[0].trim();
				addElement("common_name", commonName, treatment);
				//System.out.println("common_name:"+commonName);

				if (results.length > 1) {
					String derivation = results[1].trim();
					derivation = derivation.substring(0,
							derivation.length() - 1); // remove the last ']'
					addElement("derivation", derivation, treatment);
					//System.out.println("derivation:"+derivation);
				}
				
				//text = text.substring(0, pos).trim();
			}
		}*/
		

		if(text.trim().matches(".*?\\w+.*")){
			if(debug) System.out.println((index+1)+"unparsed: "+text);
			addElement("unparsed", text, treatment);
			File xml = new File(Registry.TargetDirectory,
					ApplicationUtilities.getProperty("TRANSFORMED") + System.getProperty("file.separator") + (index+1) + ".xml");
			listener.info("unparsed: "+text, xml.getPath());
			//errors.put((index+1)+"","still left: "+text);
		}
		return namerank.replace("_name", "");
	}


	/**
	 * Splits a taxon name string into the name proper and its authority.
	 *
	 * The following cases are tried in order; the first one that applies wins:
	 * <ol>
	 * <li>the string contains one of the rank markers subfam, var, subgen, subg,
	 *     subsp, ser, tribe, sect or subsect as a whole word: lower ranked taxa
	 *     carry their authorities inside the name, so the whole string is the
	 *     name and the authority is empty, e.g.
	 *     "Cactaceae Jussieu subfam. O puntioideae Burnett"</li>
	 * <li>family: the string starts with a run of letters ending in "ceae"
	 *     (case-insensitive); the remainder is the authority</li>
	 * <li>genus: the string starts with two upper-case letters; the name ends
	 *     at the first word boundary and the remainder is the authority</li>
	 * <li>species: the string starts with an upper-case letter; the authority
	 *     starts at the first whitespace-preceded upper-case letter or '('</li>
	 * </ol>
	 *
	 * @param name the taxon name, possibly followed by its authority
	 * @return a two-element array {name, authority}; both elements are null
	 *         when none of the cases applies
	 */
	private String[] getNameAuthority(String name) {
		// lower ranks: keep the string whole, no separate authority
		final String rankMarker = ".*?\\b(subfam|var|subgen|subg|subsp|ser|tribe|sect|subsect)\\b.*";
		if (Pattern.matches(rankMarker, name)) {
			return new String[] { name, "" };
		}

		// family
		Matcher family = Pattern.compile("^([a-z]*?ceae)(\\b.*)", Pattern.CASE_INSENSITIVE).matcher(name);
		if (family.matches()) {
			String familyName = family.group(1).replaceAll("\\s", "").trim(); // in case an extra space is there
			return new String[] { familyName, family.group(2).trim() };
		}

		// genus
		Matcher genus = Pattern.compile("^([A-Z][A-Z].*?)(\\b.*)").matcher(name);
		if (genus.matches()) {
			String genusName = genus.group(1).replaceAll("\\s", "").trim();
			return new String[] { genusName, genus.group(2).trim() };
		}

		// species
		Matcher species = Pattern.compile("^([A-Z].*?)\\s+([(A-Z].*)").matcher(name);
		if (species.matches()) {
			return new String[] { species.group(1).trim(), species.group(2).trim() };
		}

		// nothing recognised: both slots stay null
		return new String[2];
	}


	/**
	 * Adds one place_of_publication element to the treatment for every
	 * ";"-separated citation in pp, and records each publication title
	 * through vtDbA.
	 *
	 * Each citation is split at its first digit: the text before it becomes
	 * publication_title, the text from that digit on becomes
	 * place_in_publication. A citation without any digit is kept whole as
	 * the title with an empty place.
	 *
	 * @param treatment the element the place_of_publication children are appended to
	 * @param pp the citation text, e.g. "Sp. Pl. 1: 480.  1753; Gen. Pl. ed. 5, 215.  1754";
	 *           a leading comma is dropped
	 */
	private void extractPublicationPlace(Element treatment, String pp) {
		pp = pp.replaceFirst("^\\s*,", "").trim();
		// compiled once: the pattern does not depend on the citation being examined
		Pattern place_in_publication = Pattern.compile("(.*?)(\\d.*?)");
		for(String apub: pp.split(";")){
			// Start from this citation alone. Declaring these outside the loop
			// made a digit-less citation repeat the previous citation's title
			// and place while its own text was lost.
			String pub = apub.trim();
			String pip = "";
			Matcher pubm = place_in_publication.matcher(apub);
			if(pubm.matches()){
				pub = pubm.group(1).trim();
				pip = pubm.group(2).trim();
			}

			Element placeOfPub = new Element("place_of_publication");
			addElement("publication_title", pub, placeOfPub);
			addElement("place_in_publication", pip, placeOfPub);
			treatment.addContent(placeOfPub);
			if(debug) System.out.println("publication_title:"+pub);
			if(debug) System.out.println("place_in_publication:"+pip);

			// a failed database insert is logged and does not stop the remaining citations
			try {
				vtDbA.add2PublicationTable(pub);
			} catch (ParsingException e) {
				e.printStackTrace();
				LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
			} catch (SQLException e) {
				e.printStackTrace();
				LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
			}
		}
	}

	/**
	 * Appends a new child element with the given tag name and text content
	 * to parent.
	 *
	 * @param tag name of the element to create
	 * @param text text content of the new element
	 * @param parent element that receives the new child
	 */
	private static void addElement(String tag, String text, Element parent) {
		// setText hands back the element it was called on, so the child can be attached in one step
		parent.addContent(new Element(tag).setText(text));
	}

	/**
	 * Writes text to the file "count.txt" in the sub-directory of
	 * Registry.TargetDirectory whose name is the application property
	 * looked up under elementname (e.g. "DESCRIPTIONS").
	 *
	 * @param count number used as the base name of the output file
	 * @param text content to write
	 * @param elementname key of the application property holding the sub-directory name
	 * @throws ParsingException if the file cannot be opened, written or closed
	 */
	private void outputElementText(int count, String text, String elementname) throws ParsingException {
		try {
			File file = new File(Registry.TargetDirectory,
					ApplicationUtilities.getProperty(elementname) + System.getProperty("file.separator")+ count + ".txt");
			BufferedWriter out = new BufferedWriter(new FileWriter(file));
			try {
				out.write(text);
			} finally {
				// close even when write fails, otherwise the file handle leaks
				out.close();
			}
		} catch (IOException e) {
			e.printStackTrace();
			LOGGER.error("Failed to output text file in VolumeTransformer:outputDescriptionText", e);
			throw new ParsingException("Failed to output text file.", e);
		}
	}
	


}