/**
* $Id: VolumeTransformer.java 996 2011-10-07 01:13:47Z hong1.cui $
*/
package fna.parsing;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;
import fna.db.VolumeTransformerDbAccess;
/**
* To transform the extracted data to the xml format.
*
* Note: before the transformation, the data should pass the check without
* error.
*
* @author chunshui
*/
@SuppressWarnings({ "unchecked", "unused","static-access" })
public class VolumeTransformer extends Thread {
// Regex alternation of singular organ terms (plus tokens such as "2n" and "x").
// parseTextTag joins it with organnamep to count organ mentions in a paragraph.
// NOTE(review): entries containing "_" look like hyphenation-broken word fragments - confirm.
private static String organnames ="2n|achene|anther|apex|awn|ax|bark|beak|blade|bract|bracteole|branch|branchlet|broad|calyx|capsule|cap_sule|caropohore|carpophore|caudex|cluster|corolla|corona|crown|cup_|cusp|cyme|cymule|embryo|endosperm|fascicle|filament|flower|fruit|head|herb|homophyllous|hypanthium|hypanth_ium|indument|inflore|inflorescence|inflores_cence|inflo_rescence|internode|involucre|invo_lucre|in_florescence|in_ternode|leaf|limb|lobe|margin|midvein|nectary|node|ocrea|ocreola|ovary|ovule|pair|papilla|pedicel|pedicle|peduncle|perennial|perianth|petal|petiole|plant|prickle|rhizome|rhi_zome|root|rootstock|rosette|scape|seed|sepal|shoot|spikelet|spur|stamen|stem|stigma|stipule|sti_pule|structure|style|subshrub|taproot|taprooted|tap_root|tendril|tepal|testa|tooth|tree|tube|tubercle|tubercule|tuft|twig|utricle|vein|vine|wing|x";
// Regex alternation of plural organ terms; used together with organnames in parseTextTag.
private static String organnamep ="achenes|anthers|awns|axes|blades|bracteoles|bracts|branches|buds|bumps|calyces|capsules|clusters|crescents|crowns|cusps|cymes|cymules|ends|escences|fascicles|filaments|flowers|fruits|heads|herbs|hoods|inflores|inflorescences|internodes|involucres|leaves|lengths|limbs|lobes|margins|midribs|midveins|nectaries|nodes|ocreae|ocreolae|ovules|pairs|papillae|pedicels|pedicles|peduncles|perennials|perianths|petals|petioles|pistils|plants|prickles|pules|rescences|rhizomes|rhi_zomes|roots|rows|scapes|seeds|sepals|shoots|spikelets|stamens|staminodes|stems|stigmas|stipules|sti_pules|structures|styles|subshrubs|taproots|tap_roots|teeth|tendrils|tepals|trees|tubercles|tubercules|tubes|tufts|twigs|utricles|veins|vines|wings";
// Regex alternation of U.S. state names/abbreviations (dots escaped).
// parseDistriTag uses it to classify a distribution segment as "us_distribution".
private static String usstates ="Ala\\.|Alaska|Ariz\\.|Ark\\.|Calif\\.|Colo\\.|Conn\\.|Del\\.|D\\.C\\.|Fla\\.|Ga\\.|Idaho|Ill\\.|Ind\\.|Iowa|Kans\\.|Ky\\.|La\\.|Maine|Md\\.|Mass\\.|Mich\\.|Minn\\.|Miss\\.|Mo\\.|Mont\\.|Nebr\\.|Nev\\.|N\\.H\\.|N\\.J\\.|N\\.Mex\\.|N\\.Y\\.|N\\.C\\.|N\\.Dak\\.|Ohio|Okla\\.|Oreg\\.|Pa\\.|R\\.I\\.|S\\.C\\.|S\\.Dak\\.|Tenn\\.|Tex\\.|Utah|Vt\\.|Va\\.|Wash\\.|W\\.Va\\.|Wis\\.|Wyo\\.";
// Regex alternation of Canadian province/territory names/abbreviations (dots escaped).
// parseDistriTag uses it to classify a distribution segment as "ca_distribution".
private static String caprovinces="Alta\\.|B\\.C\\.|Man\\.|N\\.B\\.|Nfld\\. and Labr|N\\.W\\.T\\.|N\\.S\\.|Nunavut|Ont\\.|P\\.E\\.I\\.|Que\\.|Sask\\.|Yukon";
// Paragraph style name -> output XML element name; loaded from style-mapping.properties in the constructor.
private Properties styleMappings;
// Taxon index queried by file index for names and numbers.
// The constructor sets it to null when the index reports empty numbers or empty names.
private TaxonIndexer ti;
// Receives progress updates and informational messages during the transformation.
private ProcessListener listener;
//private Hashtable errors;
//TODO: put the following in a conf file. same for those in volumeExtractor.java
//private String start = "^Heading.*"; //starts a treatment
// Regex matched against a paragraph style: a match marks the paragraph that starts a treatment (the taxon name line).
private String start = VolumeExtractor.getStart(); //starts a treatment
// Regex matched against a paragraph style: a match marks synonym / other-name paragraphs.
private String names = ".*?(Syn|Name).*"; //other interesting names worth parsing
// Not referenced in the visible part of this file.
private String conservednamestatement ="(name conserved|nom. cons.)";
private static final Logger LOGGER = Logger.getLogger(VolumeTransformer.class);
// Database helper used by parseNameTag to record taxon and authority rows.
private VolumeTransformerDbAccess vtDbA = null;
//private Hashtable<?, ?> ranks;
// Table names, built in the constructor as "<dataPrefix>_<configured name>".
private String taxontable = null;
private String authortable = null;
private String publicationtable = null;
// JDBC connection opened in the constructor to (re)create the three tables above.
private Connection conn = null;
// Prefix that namespaces this run's database tables.
private String dataPrefix;
// Debug switches: when true, the corresponding methods print trace output to System.out.
private boolean debug = false;
private boolean debugref = false;
private boolean debugkey = true;
/**
 * Creates a transformer for one volume.
 *
 * Builds the three table names from {@code dataPrefix}, loads the taxon index
 * and the style-mapping properties, and drops and recreates the taxon,
 * authority and publication tables.
 *
 * @param listener   receives progress and informational messages
 * @param dataPrefix prefix used to namespace the database tables
 * @throws ParsingException if the style mapping file cannot be read.
 *         Database failures are only logged, not rethrown.
 */
public VolumeTransformer(ProcessListener listener, String dataPrefix) throws ParsingException {
    this.listener = listener;
    this.dataPrefix = dataPrefix;
    //this.errors = new Hashtable();
    String prefix = dataPrefix.trim();
    this.taxontable = prefix + "_" + ApplicationUtilities.getProperty("taxontable");
    this.authortable = prefix + "_" + ApplicationUtilities.getProperty("authortable");
    this.publicationtable = prefix + "_" + ApplicationUtilities.getProperty("publicationtable");
    vtDbA = new VolumeTransformerDbAccess(dataPrefix);
    ti = TaxonIndexer.loadUpdated(Registry.ConfigurationDirectory);
    // an index without numbers or names is useless; treat it as absent
    if (ti != null && (ti.emptyNumbers() || ti.emptyNames())) {
        ti = null;
    }

    // load style mapping; the stream is closed whether or not loading succeeds
    styleMappings = new Properties();
    FileInputStream mappingStream = null;
    try {
        mappingStream = new FileInputStream(
                Registry.ConfigurationDirectory + System.getProperty("file.separator") + "style-mapping.properties");
        styleMappings.load(mappingStream);
    } catch (IOException e) {
        throw new ParsingException(
                "Failed to load the style mapping file!", e);
    } finally {
        if (mappingStream != null) {
            try {
                mappingStream.close();
            } catch (IOException ignored) {
                // the mapping has been read (or the failure already reported); a close failure changes nothing
            }
        }
    }

    // (re)create the working tables; the statement is released afterwards
    Statement stmt = null;
    try {
        if (conn == null) {
            String URL = ApplicationUtilities.getProperty("database.url");
            conn = DriverManager.getConnection(URL);
            stmt = conn.createStatement();
            stmt.execute("drop table if exists " + taxontable);
            stmt.execute("create table if not exists " + taxontable + " (taxonnumber varchar(10), name varchar(500), rank varchar(20), filenumber int)");
            stmt.execute("drop table if exists " + authortable);
            stmt.execute("create table if not exists " + authortable + " (authority varchar(500) NOT NULL)");
            stmt.execute("drop table if exists " + publicationtable);
            stmt.execute("create table if not exists " + publicationtable + " (publication varchar(500) NOT NULL)");
        }
    } catch (Exception e) {
        LOGGER.error("VolumeTransformer : Database error in constructor", e);
        e.printStackTrace();
    } finally {
        if (stmt != null) {
            try {
                stmt.close();
            } catch (SQLException ignored) {
                // best-effort cleanup; the tables have already been (re)created or the error logged
            }
        }
    }
}
/**
 * Thread entry point: shows the progress bar, transforms the extracted data
 * to the xml format, and hides the progress bar again.
 *
 * The progress bar is hidden even when {@link #transform()} throws, so a
 * failed run does not leave it visible.
 */
public void run() {
    listener.setProgressBarVisible(true);
    try {
        transform();
    } finally {
        listener.setProgressBarVisible(false);
    }
}
/**
 * Transforms every extracted file into a treatment XML file.
 *
 * The files in the EXTRACTED directory are expected to be named
 * {@code 1.xml .. N.xml}, where N is the number of entries in that directory.
 * For each file, every paragraph is dispatched on its style:
 * <ul>
 * <li>style matches {@code start}: parsed as the taxon name line;</li>
 * <li>style matches {@code names}: parsed as a synonym line;</li>
 * <li>style contains "Text": non-blank text is parsed as
 *     description / distribution / discussion;</li>
 * <li>anything else: copied into an element named by the style mapping.</li>
 * </ul>
 * References and keys are then marked up further, the treatment is written to
 * the TRANSFORMED directory, and the description and habitat texts are written
 * out separately.
 *
 * @throws ParsingException if the extracted directory cannot be listed or any
 *         file fails to parse or transform
 */
public void transform() throws ParsingException {
    // get the extracted files list
    File source = new File(Registry.TargetDirectory, ApplicationUtilities.getProperty("EXTRACTED"));
    File[] extracted = source.listFiles();
    if (extracted == null) {
        // listFiles() yields null when the path is not a directory or cannot be read
        throw new ParsingException(new IOException(
                "Cannot list extracted files in: " + source.getPath()));
    }
    int total = extracted.length;
    listener.progress(1);
    try {
        for (int count = 1; count <= total; count++) {
            File file = new File(source, count + ".xml");
            SAXBuilder builder = new SAXBuilder();
            Document doc = builder.build(file);
            Element root = doc.getRootElement();
            Element treatment = new Element("treatment");
            List<Element> plist = XPath.selectNodes(root, "/treatment/paragraph");
            int textcount = 0;
            String ptexttag = "";
            for (Element pe : plist) {
                String style = pe.getChildText("style");
                if (style == null) {
                    throw new IllegalStateException(
                            "Paragraph without a style in " + file.getName());
                }
                String text = getChildText(pe, "text");
                if (style.matches(start)) {
                    // process the name tag
                    String sm = styleMappings.getProperty(style);//hong 6/26/08
                    parseNameTag(count - 1, sm, text, treatment);
                } else if (style.matches(names)) {
                    // process the synonym name tag
                    String sm = styleMappings.getProperty(style);//hong 6/26/08
                    parseSynTag(sm, text, treatment);
                } else if (style.indexOf("Text") >= 0) {//hong 6/26/08
                    // process the description, distribution, discussion tag
                    if (text.trim().compareTo("") != 0) {
                        textcount++;
                        ptexttag = parseTextTag(textcount, text, treatment, count, ptexttag);
                    }
                } else {
                    String sm = styleMappings.getProperty(style);
                    if (sm == null) {
                        // fail with the offending style instead of an opaque JDOM name error
                        throw new IllegalStateException(
                                "No style mapping for style '" + style + "' in " + file.getName());
                    }
                    Element e = new Element(sm);
                    e.setText(text);
                    treatment.addContent(e);
                }
            }
            //further mark up reference
            List<Element> references = XPath.selectNodes(treatment, "./references");
            for (Element ref : references) {
                furtherMarkupReference(ref);
            }
            //further mark up keys <run_in_sidehead>
            List<Element> keyParts = XPath.selectNodes(treatment, "./key|./couplet");
            if (keyParts.size() > 0) {//contains key
                furtherMarkupKeys(treatment);
            }
            // output the treatment to transformed
            File xml = new File(Registry.TargetDirectory,
                    ApplicationUtilities.getProperty("TRANSFORMED") + System.getProperty("file.separator") + count + ".xml");
            ParsingUtil.outputXML(treatment, xml, null);
            // output the description part to Registry.descriptions 08/04/09
            outputElementText(count, getChildText(treatment, "description"), "DESCRIPTIONS");
            // output the habitat part to Registry.habitat 08/04/09
            outputElementText(count, getChildText(treatment, "habitat"), "HABITATS");
            listener.progress((count * 50) / total);
        }
    } catch (Exception e) {
        LOGGER.error("VolumeTransformer : transform - error in parsing", e);
        e.printStackTrace();
        throw new ParsingException(e);
    }
}
/**
 * Marks up the keys of a treatment in two passes.
 *
 * Pass 1 (assembleKeys) gathers the individual key lines into TaxonKey
 * elements. Pass 2 rewrites each TaxonKey, turning raw lines such as
 *   <key>2. Carpels and stamens more than 5; plants perennial; leaves alternate; inflorescences ax-</key>
 *   <key>illary, terminal, or leaf-opposed racemes or spikes ### 3. Phytolac ca ### (in part), p. 6</key>
 * into
 *   <key_statement>
 *     <statement_id>2</statement_id>
 *     <statement>Carpels and stamens more than 5; plants perennial; leaves
 *       alternate; inflorescences ax-illary, terminal, or leaf-opposed
 *       racemes or spikes</statement>
 *     <determination>3. Phytolacca (in part), p. 6</determination>
 *   </key_statement>
 *
 * <determination> is optional, and may be replaced by <next_statement_id>.
 * Any exception raised in pass 2 is printed and otherwise ignored.
 *
 * @param treatment the treatment element whose key lines are marked up
 */
private void furtherMarkupKeys(Element treatment) {
    assembleKeys(treatment);
    try {
        List<Element> taxonKeys = XPath.selectNodes(treatment, "./TaxonKey");
        Iterator<Element> cursor = taxonKeys.iterator();
        while (cursor.hasNext()) {
            furtherMarkupKeyStatements(cursor.next());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Replaces one TaxonKey element with a "key" element holding structured statements.
 *
 * Turns individual statements:
 * <key>2. Carpels and stamens more than 5; plants perennial; leaves alternate; inflorescences ax-</key>
 * <key>illary, terminal, or leaf-opposed racemes or spikes ### 3. Phytolac ca ### (in part), p. 6</key>
 * into:
 * <key_statement>
 * <statement_id>2</statement_id>
 * <statement>Carpels and stamens more than 5;
 * plants perennial; leaves alternate; inflorescences ax-illary, terminal,
 * or leaf-opposed racemes or spikes</statement>
 * <determination>3. Phytolacca (in part), p. 6</determination>
 * </key_statement>
 *
 * <determination> is optional, and may be replaced by <next_statement_id>.
 *
 * The children are walked from last to first so that a continuation line (one that
 * does not start with an id) can be glued onto the line before it, and so that a
 * statement without a determination can point at the id of the statement after it.
 * The new "key" element is appended to the parent of taxonkey, and taxonkey is detached.
 *
 * @param taxonkey the TaxonKey element to convert; it must have a parent element
 */
private void furtherMarkupKeyStatements(Element taxonkey) {
// statements are collected in reverse document order and re-reversed at the end
ArrayList<Element> allstatements = new ArrayList<Element>();
Element marked = new Element("key");
List<Element> states = taxonkey.getChildren();
// p1 splits off the trailing determination: " ### <number>." or "Group <digit>" and everything after it
Pattern p1 = Pattern.compile("(.*?)(( ### [\\d ]+[a-z]?\\.| ?#* ?Group +\\d).*)");//determ
// p2 splits the leading id from the statement text, e.g. 2. "Ray" corollas
// (one optional character, such as an opening quote, may precede the capital letter)
Pattern p2 = Pattern.compile("^([\\d ]+[a-z]?\\..*?) (.? ?[A-Z].*)");
String determ = null; // determination found but not yet attached to a statement
String id = "";
String broken = ""; // text of a continuation line waiting for its first half
String preid = null; // id of the most recently built statement, i.e. the next one in document order
//process statements backwards
for(int i = states.size()-1; i>=0; i--){
Element state = states.get(i);
if(state.getName().compareTo("key") == 0 || state.getName().compareTo("couplet") == 0){
// glue any saved continuation onto this line (no separator is inserted)
String text = state.getTextTrim()+broken;
Matcher m = p1.matcher(text);
if(m.matches()){
text = m.group(1).trim();
determ = m.group(2).trim();
}
m = p2.matcher(text);
if(m.matches()){//good, statement starts with an id
id = m.group(1).trim();
text = m.group(2).trim();
broken = "";
//form a statement
Element statement = new Element("key_statement");
Element stateid = new Element("statement_id");
stateid.setText(id.replaceAll("\\s*###\\s*", ""));
Element stmt = new Element("statement");
stmt.setText(text.replaceAll("\\s*###\\s*", ""));
Element dtm = null;
Element nextid = null;
if(determ!=null) {
// a determination was seen on this line or on its continuation line
dtm = new Element("determination");
dtm.setText(determ.replaceAll("\\s*###\\s*", ""));
determ = null;
}else if(preid!=null){
// no determination: refer to the statement that follows in document order
nextid = new Element("next_statement_id");
nextid.setText(preid.replaceAll("\\s*###\\s*", ""));
//preid = null;
}
preid = id;
statement.addContent(stateid);
statement.addContent(stmt);
if(dtm!=null) statement.addContent(dtm);
if(nextid!=null) statement.addContent(nextid);
allstatements.add(statement);
}else if(text.matches("^[a-z]+.*")){//a broken statement, save it
// determ (if any was split off above) is kept for the line this continuation belongs to
broken = text;
}
// a line matching neither case is dropped without producing output
}else{
// non-key children are copied through; a run_in_sidehead becomes the key heading
Element stateclone = (Element)state.clone();
if(stateclone.getName().compareTo("run_in_sidehead")==0){
stateclone.setName("key_head");
}
allstatements.add(stateclone);//"discussion" remains
}
}
// restore document order
for(int i = allstatements.size()-1; i >=0; i--){
marked.addContent(allstatements.get(i));
}
taxonkey.getParentElement().addContent(marked);
taxonkey.detach();
}
/**
 * Groups the key-related children of a treatment into TaxonKey elements.
 *
 * Input shape:
 *   <treatment>
 *     <...>
 *     <references>...</references>
 *     <key>...</key>
 *   </treatment>
 *
 * Two layouts are handled:
 * 1. a single key made of "key"/"couplet" statements with no run_in_sidehead heading;
 * 2. several keys, each opened by a run_in_sidehead whose text starts with
 *    "Key to " or matches "Group <number>..." (run_in_sidehead may also tag other content).
 *
 * From the first key line or key heading onwards, every child is cloned into the
 * current TaxonKey and the original is detached. Each TaxonKey is appended to the
 * end of the treatment.
 *
 * @param treatment the treatment element to reorganise in place
 */
private void assembleKeys(Element treatment) {
    // getChildren() is a live view, so work on a snapshot while the treatment is modified
    List<Element> liveChildren = treatment.getChildren();
    Element[] snapshot = liveChildren.toArray(new Element[0]);
    ArrayList<Element> moved = new ArrayList<Element>();
    Element currentKey = null;
    boolean insideKey = false;

    for (Element child : snapshot) {
        String childName = child.getName();

        if (childName.equals("run_in_sidehead")) {
            String heading = child.getTextTrim();
            if (heading.startsWith("Key to ") || heading.matches("Group \\d+.*")) {
                // a new key begins: flush the one collected so far
                insideKey = true;
                if (currentKey != null) {
                    treatment.addContent((Element) currentKey.clone());
                }
                currentKey = new Element("TaxonKey");
            }
        }

        boolean isKeyLine = childName.equals("key") || childName.equals("couplet");
        if (!insideKey && isKeyLine) {
            // a key without a heading starts at its first statement
            insideKey = true;
            if (currentKey == null) {
                currentKey = new Element("TaxonKey");
            }
        }

        if (insideKey) {
            moved.add(child);
            currentKey.addContent((Element) child.clone());
        }
    }

    if (currentKey != null) {
        treatment.addContent(currentKey);
    }
    // the originals now live (as clones) inside the TaxonKey elements
    for (Element original : moved) {
        original.detach();
    }
}
/**
 * Splits the text of a references element into individual reference children.
 *
 * Turns
 *   <references>SELECTED REFERENCES Behnke, H.-D., C. Chang, I. J. Eifert, and T. J. Mabry. 1974.
 *   Betalains and P-type sieve-tube plastids in Petiveria and Agdestis (Phytolaccaceae).
 *   Taxon 23: 541–542. Brown, G. K. and G. S. Varadarajan. 1985. Studies in Caryophyllales I:
 *   Re-evaluation of classification of Phytolaccaceae s.l. Syst. Bot. 10: 49–63. ...</references>
 * into
 *   <references><reference>SELECTED REFERENCES Behnke, ... Taxon 23: 541–542.</reference>
 *   <reference>...</reference>...</references>
 *
 * A reference is taken to end at a page range "digits, dash, digits, period"
 * (optionally followed by "]") when the next text starts with a capitalised
 * word followed by a comma. Whatever remains after the last split is added as
 * the final reference, prefixed with "item:".
 *
 * @param ref the references element; its text is replaced by reference children
 */
private void furtherMarkupReference(Element ref) {
    String text = ref.getText();
    ref.setText("");
    if (this.debugref) System.out.println("\nReferences text:" + text);
    // The dash in the page range was a mis-encoded literal (U+FFFD) that can never match
    // a real en dash. It is now spelled with Unicode escapes so it is independent of the
    // source file encoding; U+FFFD is still accepted for input that was itself mis-decoded.
    Pattern p = Pattern.compile("(.*?\\d+[\u2013\uFFFD]\\d+\\.\\]?)(\\s+[A-Z]\\w+,.*)");
    Matcher m = p.matcher(text);
    while (m.matches()) {
        String refstring = m.group(1);
        Element refitem = new Element("reference");
        refitem.setText(refstring);
        ref.addContent(refitem);
        if (this.debugref) System.out.println("a ref:" + refstring);
        // continue with the remainder
        text = m.group(2);
        m = p.matcher(text);
    }
    // NOTE(review): the "item:" prefix is applied only to the last reference and looks like
    // debugging residue; kept unchanged because consumers of the output are not visible here.
    Element refitem = new Element("reference");
    refitem.setText("item:" + text);
    ref.addContent(refitem);
    if (this.debugref) System.out.println("a ref:" + text);
}
/**
 * Concatenates the text of all child elements with the given name.
 *
 * @param pe     the element whose children are read
 * @param string the child element name, used as the XPath "./name"
 * @return the texts joined by spaces, with every whitespace run collapsed to a
 *         single space and the result trimmed; empty if there is no such child
 * @throws Exception if the XPath selection fails
 */
private String getChildText(Element pe, String string) throws Exception {
    List<Element> matches = XPath.selectNodes(pe, "./" + string);
    StringBuilder joined = new StringBuilder();
    for (Element match : matches) {
        joined.append(match.getText());
        joined.append(" ");
    }
    String collapsed = joined.toString().replaceAll("\\s+", " ");
    return collapsed.trim();
}
/**
 * Classifies one "Text" paragraph as description, distribution or discussion
 * and adds the corresponding content to the treatment.
 *
 * The first text paragraph is a description when it mentions at least two
 * organ terms, otherwise a distribution. Later paragraphs are decided by the
 * tag of the preceding paragraph: after a description comes a distribution;
 * after a distribution or discussion comes a discussion.
 *
 * @param textcount 1-based position of this paragraph among the non-blank text paragraphs
 * @param text      the paragraph text
 * @param treatment the treatment element receiving the new content
 * @param filecount the current file number (not used here)
 * @param ptag      the tag returned for the previous text paragraph
 * @return "description", "distribution" or "discussion"; "" when no rule applied
 *         (in that case nothing is added to the treatment)
 */
private String parseTextTag(int textcount, String text, Element treatment, int filecount, String ptag) {
    // count organ terms, singular or plural, ignoring case
    Pattern organpt = Pattern.compile("\\b(" + organnamep + "|" + organnames + ")\\b", Pattern.CASE_INSENSITIVE);
    Matcher organs = organpt.matcher(text);
    int organcount = 0;
    while (organs.find()) {
        organcount++;
    }

    if (textcount == 1) {
        if (organcount >= 2) {
            addElement("description", text, treatment);
            return "description";
        }
        //TODO: further markup distribution to: # of infrataxa, introduced, generalized distribution, flowering time,habitat, elevation, state distribution, global distribution
        parseDistriTag(text, treatment);
        return "distribution";
    }
    if (ptag.equals("description")) {//hong: 3/11/10 for FNA v19
        parseDistriTag(text, treatment);
        return "distribution";
    }
    if (ptag.equals("distribution") || ptag.equals("discussion")) {
        addElement("discussion", text, treatment);
        return "discussion";
    }
    return "";
}
/**
 * Breaks a distribution paragraph into finer-grained elements.
 *
 * A paragraph that opens with a rank word (Genera, Genus, Species, Subspecies,
 * Varieties, Subgenera) up to a colon is treated as "species with infrataxa and
 * higher" and yields: number_of_infrataxa, introduced, general_distribution.
 *
 * Any other paragraph is treated as "species without infrataxa and lower" and
 * yields, where present: flowering_time, habitat, conservation, elevation,
 * introduced, and then one distribution per ";"-separated segment, classified
 * as us_distribution, ca_distribution or global_distribution.
 *
 * A paragraph matching neither form is reported on System.err.
 *
 * @param text      the distribution paragraph
 * @param treatment the treatment element receiving the new content
 */
private void parseDistriTag(String text, Element treatment) {
    Matcher higher = Pattern.compile("^((?:Genera|Genus|Species|Subspecies|Varieties|Subgenera).*?:)\\s*(introduced\\s*;)?(.*)").matcher(text);
    if (higher.matches()) {//species and higher
        String infrataxa = higher.group(1);
        String introduced = higher.group(2);
        String general = higher.group(3);
        if (infrataxa != null) {
            addElement("number_of_infrataxa", infrataxa, treatment);
        }
        if (introduced != null) {
            addElement("introduced", introduced, treatment);
        }
        if (general != null) {
            //further markup distribution
            treatment = new DistributionParser4FNA(treatment, general, "general_distribution").parse();
        }
        return;
    }

    //species and lower
    Matcher lower = Pattern.compile("(Flowering.*?\\.)?(.*?(?:;|\\.$))?(\\s*of conservation concern\\s*(?:;|\\.$))?(.*?\\b(?:\\d+|m)\\b.*?(?:;|\\.$))?\\s*(introduced(?:;|\\.$))?(.*)").matcher(text);
    if (!lower.matches()) {
        System.err.println("distribution not match: "+text);
        return;
    }

    if (lower.group(1) != null) {
        // the flowering-time parser hands back the treatment to continue with
        treatment = new FloweringTimeParser4FNA(treatment, lower.group(1), "flowering_time").parse();
    }
    if (lower.group(2) != null) {
        addElement("habitat", lower.group(2), treatment);
    }
    if (lower.group(3) != null) {
        addElement("conservation", lower.group(3), treatment);
    }
    if (lower.group(4) != null) {
        addElement("elevation", lower.group(4), treatment);
    }
    if (lower.group(5) != null) {
        addElement("introduced", lower.group(5), treatment);
    }

    String areas = lower.group(6);
    if (areas == null) {
        return;
    }
    String usRegex = ".*?\\b(" + usstates + ")(\\W|$).*";
    String caRegex = ".*?\\b(" + caprovinces + ")(\\W|$).*";
    for (String area : areas.split(";")) {
        // U.S. states take precedence over Canadian provinces; everything else is global
        String areaTag;
        if (area.matches(usRegex)) {
            areaTag = "us_distribution";
        } else if (area.matches(caRegex)) {
            areaTag = "ca_distribution";
        } else {
            areaTag = "global_distribution";
        }
        treatment = new DistributionParser4FNA(treatment, area, areaTag).parse();
    }
}
/**
 * Adds a synonym element to the treatment.
 *
 * The element is named after the lowest-ranked name already present in the
 * treatment, checked in this order: variety, subspecies, species, tribe, genus
 * (giving e.g. "synonym_of_species_name"). If none of these is present, the
 * supplied tag is used as is.
 *
 * @param tag       fallback element name taken from the style mapping
 * @param text      the synonym text
 * @param treatment the treatment element receiving the new element
 */
private void parseSynTag(String tag, String text, Element treatment) {
    String[] rankedNames = {"variety_name", "subspecies_name", "species_name", "tribe_name", "genus_name"};
    for (String rankedName : rankedNames) {
        if (treatment.getChild(rankedName) != null) {
            tag = "synonym_of_" + rankedName;
            break;
        }
    }
    addElement(tag, text, treatment);
}
/**
 * Parses the name line of a treatment and adds the pieces to the treatment.
 *
 * In order, the method adds: number; the name under its rank element (with
 * authority, when one is split off); etymology (a trailing "[...]"); place of
 * publication (text up to a four-digit year); conserved status; past_name
 * (a leading parenthesised name); common_name(s); and finally any text that
 * still contains word characters as "unparsed", which is also reported to the
 * listener. The name and authority are recorded in the database; database
 * failures are logged and do not stop the parsing.
 *
 * @param index     0-based file index used to look up name and number in the taxon index
 * @param namerank  element name from the style mapping; the generic
 *                  "species_subspecies_variety_name" is resolved from the text
 * @param line      the raw name line
 * @param treatment the treatment element receiving the new elements
 * @return the rank without its "_name" suffix, or "" when the line is empty or
 *         no name is available for this index
 */
private String parseNameTag(int index, String namerank, String line,
        Element treatment) {
    if (line == null || line.equals("")) {
        return ""; //TODO: should not happen. but did happen with v. 19 295.xml==>VolumeExtractor JDOM problem.
    }
    // The constructor sets ti to null when the index has no names or numbers;
    // treat that like a missing name instead of failing with a NullPointerException.
    String name = (ti == null) ? null : ti.getName(index);
    if (name == null || name.compareTo("") == 0) {
        File xml = new File(Registry.TargetDirectory,
                ApplicationUtilities.getProperty("TRANSFORMED") + System.getProperty("file.separator") + (index + 1) + ".xml");
        listener.info("no name found in: ", xml.getPath());
        return "";
    }
    // work on a normalised copy of the line
    // NOTE(review): the first replaced character is a mis-encoded literal (U+FFFD in this source);
    // the original comment says it stands for whitespace that is "not really a space" - confirm which.
    String text = line.replaceAll("�", " ").replaceAll("\\s+", " ").trim();
    if (debug) System.out.println("\n" + (index + 1) + ": text=" + text);

    // number (ti is known to be non-null here)
    String number = ti.getNumber(index);
    addElement("number", number, treatment); // TODO: add the number tag to the style mapping

    text = VolumeVerifier.fixBrokenNames(text);
    // drop everything before the first capital letter (the leading number)
    text = text.replaceFirst("^.*?(?=[A-Z])", "").trim();

    //namerank and name
    if (namerank.indexOf("species_subspecies_variety_name") >= 0) {
        if (text.indexOf("var.") >= 0) {
            namerank = "variety_name";
        } else if (text.indexOf("subsp.") >= 0) {
            namerank = "subspecies_name";
        } else if (text.indexOf("ser.") >= 0) {
            namerank = "series_name";
        } else if (text.indexOf("subsect.") >= 0) {
            // must be tested before "sect.", which is a substring of "subsect."
            namerank = "subsection_name";
        } else if (text.indexOf("sect.") >= 0) {
            namerank = "section_name";
        } else {
            namerank = "species_name";
        }
    }
    if (debug) System.out.println("namerank:" + namerank);

    String[] nameinfo = getNameAuthority(name);
    if (nameinfo[0] != null && nameinfo[1] != null) {
        addElement(namerank, nameinfo[0], treatment);
        try {
            vtDbA.add2TaxonTable(number, name, namerank, index + 1);
        } catch (ParsingException e) {
            e.printStackTrace();
            LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
        } catch (SQLException e) {
            e.printStackTrace();
            LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
        }
        if (debug) System.out.println("name:" + nameinfo[0]);
        if (nameinfo[1].length() > 0) {
            addElement("authority", nameinfo[1], treatment);
            try {
                vtDbA.add2AuthorTable(nameinfo[1]);
            } catch (ParsingException e) {
                e.printStackTrace();
                LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
            } catch (SQLException e) {
                e.printStackTrace();
                LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
            }
            if (debug) System.out.println("authority:" + nameinfo[1]);
        }
        // remove the name (by length) from the front of the text
        text = text.replaceFirst("^\\s*.{" + name.length() + "}", "").trim();
    }

    //derivation: deal with this first to remove [] and avoid pub-year match in []
    Pattern p = Pattern.compile("(.*?)(\\[.*?\\]$)");
    Matcher m = p.matcher(text);
    if (m.matches()) {
        if (m.group(2).trim().compareTo("") != 0) {
            addElement("etymology", m.group(2).trim(), treatment);
            if (debug) System.out.println("etymology:" + m.group(2).trim());
        }
        text = m.group(1).trim();
    }

    //place of publication
    //TODO: a better fix is needed for lines such as: Brittonia 28: 427, fig. 1. 1977 <bullet> Yellow spinecape [For George Jones Goodman, 1904-1999
    p = Pattern.compile("(.* [12]\\d\\d\\d)($|,|\\.| +)(.*)");
    m = p.matcher(text);
    if (m.matches()) {
        String pp = m.group(1).replaceFirst("^\\s*[,\\.]", "").trim();
        extractPublicationPlace(treatment, pp); //pp may be "Sp. Pl. 1: 480. 1753; Gen. Pl. ed. 5, 215. 1754"
        text = m.group(3).trim();
    }

    // conserved: try the three known phrasings in turn
    String conserved = "name conserved";
    int pos = text.indexOf(conserved);
    if (pos < 0) {
        conserved = "name proposed for conservation";
        pos = text.indexOf(conserved);
    }
    if (pos < 0) {
        conserved = "nom. cons.";
        pos = text.indexOf(conserved);
    }
    if (pos != -1) {
        text = text.replace(conserved, "").trim();
        addElement("conserved", conserved, treatment);
        if (debug) System.out.println("conserved:" + conserved);
    }

    //past_name
    p = Pattern.compile("\\((?:as )?(.*?)\\)(.*)");
    m = p.matcher(text);
    if (m.matches()) {
        if (m.group(1).trim().compareTo("") != 0) {
            addElement("past_name", m.group(1).trim(), treatment);
            if (debug) System.out.println("past_name:" + m.group(1).trim());
        }
        text = m.group(2).trim();
    }

    //common name
    // NOTE(review): the two separator characters in this class are mis-encoded literals
    // (U+FFFD in this source); presumably bullet-like marks that precede common names - confirm
    // against the original encoding before changing them.
    p = Pattern.compile("(.*?)[��](.*?)(\\[.*|$)");
    m = p.matcher(text);
    if (m.matches()) {
        if (m.group(2).trim().compareTo("") != 0) {
            String[] commonnames = m.group(2).trim().split("\\s*,\\s*");
            for (String cname : commonnames) {
                addElement("common_name", cname, treatment);
                if (debug) System.out.println("common_name:" + cname);
            }
        }
        text = (m.group(1) + " " + m.group(3)).trim();
    }

    // anything with word characters left over could not be parsed
    if (text.trim().matches(".*?\\w+.*")) {
        if (debug) System.out.println((index + 1) + "unparsed: " + text);
        addElement("unparsed", text, treatment);
        File xml = new File(Registry.TargetDirectory,
                ApplicationUtilities.getProperty("TRANSFORMED") + System.getProperty("file.separator") + (index + 1) + ".xml");
        listener.info("unparsed: " + text, xml.getPath());
    }
    return namerank.replace("_name", "");
}
/**
 * Splits a taxon name string into the name proper and its authority.
 *
 * <p>The checks are applied in this order, and the first one that matches wins:
 * <ol>
 * <li>a rank marker (subfam, var, subgen, subg, subsp, ser, tribe, sect, subsect)
 *     appears as a whole word: the entire string is kept as the name and the
 *     authority is the empty string, e.g.
 *     "Cactaceae Jussieu subfam. Opuntioideae Burnett";</li>
 * <li>family: a leading run of letters ending in "ceae" (case-insensitive) is the
 *     name, the remainder is the authority;</li>
 * <li>genus: a leading token starting with two upper-case letters is the name, the
 *     remainder is the authority;</li>
 * <li>species: the text up to the first whitespace that is followed by '(' or an
 *     upper-case letter is the name, the remainder is the authority.</li>
 * </ol>
 *
 * @param name the raw name text
 * @return a two-element array {name, authority}; both elements are null when none
 *         of the patterns match
 */
private String[] getNameAuthority(String name) {
    final String[] parts = new String[2];

    // names carrying a rank marker keep their authorities embedded in the name itself
    final boolean hasRankMarker =
            Pattern.matches(".*?\\b(subfam|var|subgen|subg|subsp|ser|tribe|sect|subsect)\\b.*", name);
    if (hasRankMarker) {
        parts[0] = name;
        parts[1] = "";
        return parts;
    }

    final Matcher family =
            Pattern.compile("^([a-z]*?ceae)(\\b.*)", Pattern.CASE_INSENSITIVE).matcher(name);
    final Matcher genus = Pattern.compile("^([A-Z][A-Z].*?)(\\b.*)").matcher(name);
    final Matcher species = Pattern.compile("^([A-Z].*?)\\s+([(A-Z].*)").matcher(name);

    if (family.matches()) {
        // whitespace is stripped from the name in case an extra space is there
        parts[0] = family.group(1).replaceAll("\\s", "").trim();
        parts[1] = family.group(2).trim();
    } else if (genus.matches()) {
        parts[0] = genus.group(1).replaceAll("\\s", "").trim();
        parts[1] = genus.group(2).trim();
    } else if (species.matches()) {
        parts[0] = species.group(1).trim();
        parts[1] = species.group(2).trim();
    }
    // when nothing matched, both slots are still null
    return parts;
}
/**
 * Splits a place-of-publication citation into its individual publications and
 * records each one.
 *
 * <p>A leading comma is removed and the citation is split on ';' (for example
 * "Sp. Pl. 1: 480. 1753; Gen. Pl. ed. 5, 215. 1754" yields two segments). For each
 * segment the text before the first digit becomes the publication_title and the
 * text from the first digit onward becomes the place_in_publication. Both are
 * added as children of a new place_of_publication element appended to
 * {@code treatment}, and the title is handed to
 * {@code vtDbA.add2PublicationTable}.
 *
 * <p>A segment that contains no digit is recorded with the whole trimmed segment
 * as its title and an empty place; it no longer repeats the values extracted
 * from the previous segment.
 *
 * @param treatment element that receives one place_of_publication child per segment
 * @param pp the citation text, possibly holding several ';'-separated publications
 */
private void extractPublicationPlace(Element treatment, String pp) {
    pp = pp.replaceFirst("^\\s*,", "").trim();
    // loop-invariant: title is everything before the first digit, place is the rest
    Pattern placeInPublication = Pattern.compile("(.*?)(\\d.*?)");
    String[] pps = pp.split(";");
    for (String apub : pps) {
        // start from per-segment defaults so a non-matching segment cannot
        // inherit the title/place of the segment before it
        String pub = apub.trim();
        String pip = "";
        Matcher pubm = placeInPublication.matcher(apub);
        if (pubm.matches()) {
            pub = pubm.group(1).trim();
            pip = pubm.group(2).trim();
        }
        Element placeOfPub = new Element("place_of_publication");
        addElement("publication_title", pub, placeOfPub);
        addElement("place_in_publication", pip, placeOfPub);
        treatment.addContent(placeOfPub);
        if (debug) System.out.println("publication_title:" + pub);
        if (debug) System.out.println("place_in_publication:" + pip);
        try {
            vtDbA.add2PublicationTable(pub);
        } catch (ParsingException e) {
            // a failed table insert is logged; the remaining segments are still processed
            e.printStackTrace();
            LOGGER.error("Couldn't perform parsing in VolumeTransformer:parseNameTag", e);
        } catch (SQLException e) {
            e.printStackTrace();
            LOGGER.error("Database access error in VolumeTransformer:parseNameTag", e);
        }
    }
}
/**
 * Creates a child element named {@code tag} whose text is {@code text} and
 * appends it to {@code parent}.
 *
 * @param tag name of the new element
 * @param text text content set on the new element
 * @param parent element that receives the new child
 */
private static void addElement(String tag, String text, Element parent) {
    final Element child = new Element(tag).setText(text);
    parent.addContent(child);
}
/**
 * Writes {@code text} to the file {@code <count>.txt} inside the sub-directory of
 * {@code Registry.TargetDirectory} whose name is the value of the application
 * property {@code elementname} (e.g. "DESCRIPTIONS").
 *
 * <p>The writer is closed in a {@code finally} block so the file handle is
 * released even when the write itself fails.
 *
 * @param count number used as the base name of the output file
 * @param text content to write
 * @param elementname key passed to {@code ApplicationUtilities.getProperty} to
 *        obtain the sub-directory name
 * @throws ParsingException if the file cannot be opened, written or closed
 */
private void outputElementText(int count, String text, String elementname) throws ParsingException {
    try {
        File file = new File(Registry.TargetDirectory,
                ApplicationUtilities.getProperty(elementname) + System.getProperty("file.separator") + count + ".txt");
        BufferedWriter out = new BufferedWriter(new FileWriter(file));
        try {
            out.write(text);
        } finally {
            // always release the handle; close() also flushes on the success path
            out.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        LOGGER.error("Failed to output text file in VolumeTransformer:outputDescriptionText", e);
        throw new ParsingException("Failed to output text file.", e);
    }
}
}