package fna.parsing;
import java.io.File;
import java.io.FileInputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;
/**
* Hong. 08/04/09
* a) add ./text[@case=""] and fixBrokenNames
* b) TODO: output a report of potential errors for the user to check manually.
* To verify the volume document format.
*
* Style verify:
*
* Number verify:
*
* Name verify:
*
* @author chunshui
*/
@SuppressWarnings({ "unchecked", "unused" })
public class VolumeVerifier extends Thread {
private String target;
private String conf;
private ProcessListener listener;
private String path;
private int total;
private File[] extracted;
private TaxonIndexer ti;
private String namelist = "|";
private static final Logger LOGGER = Logger.getLogger(VolumeVerifier.class);
public VolumeVerifier(ProcessListener listener) {
this.listener = listener;
target = Registry.TargetDirectory+System.getProperty("file.separator");
conf = Registry.ConfigurationDirectory;
path = target + "extracted";
}
public void run () {
listener.setProgressBarVisible(true);
verify();
listener.setProgressBarVisible(false);
}
public void verify() {
// get the extracted files list
listener.progress(1);
File directory = new File(path);
extracted = directory.listFiles();
total = extracted.length;
listener.info("To verify files: " + extracted.length);
listener.progress(10);
// init the taxon index
ti = new TaxonIndexer(conf); // TODO: add the
// taxon index to
// conf
ti.build();
listener.info("Taxon index initialized.");
listener.progress(30);
// verify the files
listener.info("To verify the files");
if (!verifyFile()) {
listener.info("File verify failure!");
return;
} else {
listener.info("File verify success!");
}
listener.progress(50);
// verify the style
listener.info("To verify the style");
if (!verifyStyle()) {
listener.info("Style verify failure!");
return;
} else {
listener.info("Style verify success!");
}
listener.progress(70);
// verify the number
listener.info("To verify the number");
if (!verifyNumber()) {
listener.info("Number verify failure!");
return;
} else {
listener.info("Number verify success!");
}
listener.progress(90);
// verify the name
listener.info("To verify the name");
if (!verifyName()) {
listener.info("Name verify failure!");
return;
} else {
listener.info("Name verify success!");
}
listener.info("Volume format verify success!");
// write the updated TaxonIndexer
TaxonIndexer.saveUpdated(conf, ti);
listener.info("Update the TaxonIndexer success!");
listener.progress(99);
}
/**
* To verify the extracted files validity.
*
* @return passed
*/
private boolean verifyFile() {
boolean passed = true;
for (int i = 1; i <= total; i++) {
File file = new File(path, i + ".xml");
if (!file.exists()) {
listener.info("", file.getName(), "File does not exist!");
passed = false;
} else if (!file.isFile()) {
listener.info("", file.getName(), "File is not a file!");
passed = false;
}
}
return passed;
}
/**
* To verify the style of the document.
*
* @return passed
*/
private boolean verifyStyle() {
boolean passed = true;
try {
// load style mapping
Properties props = new Properties();
// read in the translation properties;
props.load(new FileInputStream(conf + System.getProperty("file.separator")+"style-mapping.properties"));
for (int i = 1; i <= total; i++) {
File file = new File(path, i + ".xml");
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file);
Element root = doc.getRootElement();
// find all <style> tags
List<Element> styleList = XPath.selectNodes(root,
"/treatment/paragraph/style");
// iterate over the <style> tags
for (Iterator<Element> iter = styleList.iterator(); iter.hasNext();) {
Element se = (Element) iter.next();
String style = se.getText();
String mapping = props.getProperty(style);
// verify if the style has a mapping
if (mapping == null || mapping.length() == 0) {
listener.info("", file.getName(), "Invalid style "
+ style);
passed = false;
}
}
}
} catch (Exception e) {
LOGGER.error("VolumeVerifier : verifyStyle - Exception ", e);
throw new ParsingException(e);
}
return passed;
}
/**
* To verify the taxon number according to the taxon index.
*
* If taxon index is not built yet, build it here.
*
* @return passed True if the verify passed
*/
private boolean verifyNumber() {
boolean passed = true;
if(ti.emptyNumbers()){
listener.info("no taxon list number to check against, build taxon numbers. ");
fillInNumbers();
listener.info("check passed");
return passed;
}
try {
for (int i = 1; i <= total; i++) {
File file = new File(path, i + ".xml");
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file);
Element root = doc.getRootElement();
Element te = (Element) XPath.selectSingleNode(root,
"/treatment/paragraph/text");
String text = te.getText();
String number = ti.getNumber(i - 1);
if (!text.startsWith(number)) {
// hong 6/26/08: make 12c1 and 12c.1 match
// extract number 12c.1 from text and save it in ti
Pattern p = Pattern.compile("(.*?[a-z])(\\d+)");
Matcher m = p.matcher(number);
boolean check = false;
if (m.matches()) {
String pt = m.group(1) + "\\.?" + m.group(2);
p = Pattern.compile(pt);
m = p.matcher(text);
if (m.find()) {
ti.addNumber(text.substring(m.start(), m.end())); //add one by one in sequence
check = true;
}
}
if (!check) {
listener.info("", file.getName(),
"Invalid number! Expected: " + number);
passed = false;
}
}
}
} catch (Exception e) {
LOGGER.error("VolumeVerifier : verifyNumber - Exception ", e);
e.printStackTrace();
throw new ParsingException(e);
}
return passed;
}
private void fillInNumbers(){
try {
for (int i = 1; i <= total; i++) {
File file = new File(path, i + ".xml");
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file);
Element root = doc.getRootElement();
Element te = (Element) XPath.selectSingleNode(root,
"/treatment/paragraph/text");
String text = "";
if(te != null){//TODO: te should not be null, but for 295.xml JDOM screw-up
text = te.getText();
text = text.replaceAll("\\s[a-zA-Z].*", ""); //1. Amstersdfds 12.c Ames
}
if(text.matches("^\\d.*")){
ti.addNumber(text); //add one by one in sequence
System.out.println("add numbers :"+text+" for file "+i+".xml");
}else{
ti.addNumber("0"); //add one by one in sequence
System.out.println("add numbers :"+0+" for file "+i+".xml");
}
}
} catch (Exception e) {
LOGGER.error("VolumeVerifier : fillInNumbers - Exception ", e);
e.printStackTrace();
throw new ParsingException(e);
}
}
/**
* To verify the taxon name according to the taxon index
*
* If taxon index is not built yet, build it here.
*
* TODO: verify the var name
*
* @return pass True if the verify passed
*/
private boolean verifyName() {
boolean passed = true;
if(ti.emptyNames()){
listener.info("no taxon list name to check against, build taxon names. ");
fillInNames();
listener.info("check passed");
return passed;
}
try {
for (int i = 1; i <= total; i++) {
File file = new File(path, i + ".xml");
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file);
Element root = doc.getRootElement();
String number = ti.getNumber(i - 1);
String name = ti.getName(i - 1);
//Element te = (Element) XPath.selectSingleNode(root, "/treatment/paragraph/text");
Element pe = (Element) XPath.selectSingleNode(root, "/treatment/paragraph");
String extractedname = extractName(pe, i + ".xml");
//boolean check = (extractedname == null) || (name.indexOf(extractedname) >= 0 || extractedname.indexOf(name) >= 0);
if (! match(name, extractedname)) {//HongCui
passed = false;
listener.info("", file.getName(),
"Invalid name. Expected: " + number + "." + name); // TODO: if the logic is right after merged with hong's code?
}
}
} catch (Exception e) {
e.printStackTrace();
LOGGER.error("VolumeVerifier : verifyName - Exception ", e);
throw new ParsingException(e);//HongCui
}
return passed;
}
//if all words in name are in answer, then return true
private boolean match (String answer, String name){
if(name == null){
return false;
}
name = name.toLowerCase();
name = name.replaceAll("\\W", " ");
name = name.trim();
answer = answer.toLowerCase();
answer = answer.replaceAll("\\W", " ");
answer = answer.trim();
String [] answerparts = name.split("\\s+");
for(int i = 0; i<answerparts.length; i++){
if(name.indexOf(answerparts[i]) < 0){
return false;
}
}
return true;
}
/**
* extract a name from the name paragraph element
* a name includes its authority
* @param pe
* @return
*/
private String extractName(Element pe, String filename) {//TODO: the case of "�" in v. 19, 516.xml "Agoseris �elata"
try{
//concat <text> elements into one string
StringBuffer buffer=new StringBuffer();
List<Element> textList = XPath.selectNodes(pe, "./text");
List<Element> additionalList = XPath.selectNodes(pe, "./text[@case='"+Registry.TribeGenusNameCase+"']");
textList.addAll(additionalList);
for (Iterator<Element> ti = textList.iterator(); ti.hasNext();) {
Element wt = (Element) ti.next();
buffer.append(wt.getText()).append(" ");
}
String text = buffer.toString().replaceAll("\\s+", " ").trim();
text = text.replaceFirst("^.*?(?=[A-Z])", "").trim();
//fix broken names: T HYRSOSTACHYS; va r. subhispida
text = fixBrokenNames(text);
text = chunkPlaceOfPub(text, filename); //after this, text should only hold name information
if(text.length() == 0){return "";} //TODO: shouldn't happen, except for 295.xml
// hong 6/26/08: make 1.Pterostegia drymarioides
// and 1. Pterostegia drymarioides Fischer ... match.
// Extract full name string and save it in ti.
//tribe: didn't need smallCaps info.
Pattern p = Pattern
.compile("\\b(?:subfam|var|subgen|subg|subsp|ser|tribe|subsect)[\\.\\s]+([-a-z]+)", Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(text);
Pattern p1 = Pattern
.compile("\\bsect[\\.\\s]+([-a-zA-Z]+)"); //Journal names may contain e.g. Sect. IV Hong 08/04/09
Matcher m1 = p1.matcher(text);
boolean check = false;
if (m.find() || m1.find()) {// sub- names
//Pattern p4 = Pattern.compile("(.*?\\s+|^)(\\D.*)");
//Matcher m4 = p4.matcher(text);
//if(m4.matches()){
//String aname = m4.group(2);
String aname = text;
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [sub rank]");
return aname;
//System.out.println("::::::::::duplicate "+aname+" [sub rank]");//should not occur
}else{
namelist += laname+"|";
return aname;
}
// }
/*Pattern p2 = Pattern.compile("\\.\\s*(\\D*?\\bsubg[\\.\\s]+[-a-z]+[\\s\\w\\.]*\\)[\\s\\w]*\\.\\s?[A-Z]+\\s*\\w+)",
Pattern.CASE_INSENSITIVE);
Matcher m2 = p2.matcher(text);
if (m2.find()) {
String aname = m2.group(1);
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [sub rank]");
return aname;
//System.out.println("::::::::::duplicate "+aname+" [sub rank]");//should not occur
}else{
namelist += laname+"|";
return aname;
}
}
Pattern p3 = Pattern.compile("\\.\\s*(\\D*?\\bsect[\\.\\s]+[-a-z]+[\\s\\w\\.]*\\)[\\s\\w]*\\.\\s?[A-Z]+\\s*\\w+)",
Pattern.CASE_INSENSITIVE);
Matcher m3 = p3.matcher(text);
if (m3.find()) {
String aname = m3.group(1);
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [sub rank]");
return aname;
//System.out.println("::::::::::duplicate "+aname+" [sub rank]");//should not occur
}else{
namelist += laname+"|";
return aname;
}
}
p = Pattern.compile("\\.\\s*(\\D*?\\b(?:subfam|var|subgen|subsp|ser|tribe)[\\.\\s]+[-a-z]+)",
Pattern.CASE_INSENSITIVE); //Hong 08/04/09
m = p.matcher(text);
if (m.find()) {
String aname = m.group(1);
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [sub rank]");
return aname;
//System.out.println("::::::::::duplicate "+aname+" [sub rank]");//should not occur
}else{
namelist += laname+"|";
return aname;
}
}*/
} else {// family, genus, species names
/*
* FNA v. 5 and 19
*List<Element> textList2 = XPath.selectNodes(pe, "./text");
int n = nameLine(textList2);
String namestring = textList2.get(n).getText(); //2nd <text> holds the name
//exception: <text>1. Artemisia aleutica</text>
namestring = namestring.replaceAll("^\\d.*?\\s", "");
Pattern famname = Pattern.compile("\\b([a-z]*?ceae)\\b.*", Pattern.CASE_INSENSITIVE);
Pattern genname = Pattern.compile("^([A-Z][A-Z].*?)\\b.*"); //NOTHOCALAIS with two dots on top of last I
m = famname.matcher(namestring);*/
String namestring = text;
namestring = namestring.replaceAll("^\\d.*?\\s+", "");
namestring = namestring.replaceAll("^\\.\\s+", ""); // if there is a . Hong 08/04/09 e.g "4 . XXXX"
Pattern famname = Pattern.compile("^([a-z]*?ceae)\\b.*", Pattern.CASE_INSENSITIVE);
Pattern genname = Pattern.compile("^([A-Z][A-Z].*?)\\b.*"); //NOTHOCALAIS with two dots on top of last I
m = famname.matcher(namestring);
if(m.find()){
//String aname = m.group(1);
String aname = text;
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [family rank]");
//System.out.println("::::::::::duplicate "+aname+" [family rank]"); //should not occur
}else{
namelist += laname+"|";
return aname; //family
}
}else{
m = genname.matcher(namestring);
if(m.find()){
//String aname = m.group(1);
String aname = text;
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [genus rank]");
//System.out.println("::::::::::duplicate "+aname+" [genus rank]"); //should not occur
}else{
namelist += laname+"|";
return aname; //genus
}
}
else{
//String aname = namestring;//species name
//Hong: 08/04/09 take the first two words as species name
//String[] tokens = namestring.trim().split("\\s+");
//if(tokens.length >= 2){
//String aname = tokens[0]+" "+tokens[1];
String aname = text;
String laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
listener.info("", filename, "Repeated taxon name:"+aname+" [species rank]");
//System.out.println("::::::::::duplicate "+aname+" [species rank]"); //could occur
/*String nextseg = textList2.get(n+1).getText();
String firstword = nextseg.indexOf(" ")>0? nextseg.substring(0,nextseg.indexOf(" ")) : nextseg;
aname += " "+firstword;
laname = aname.toLowerCase();
if(namelist.indexOf("|"+laname+"|")>=0){
System.out.println("::::::::::2nd try: duplicate "+aname+" [species rank]"); //should not occur
}else{
namelist += laname+"|";
return aname;
} Hong 08/04/09*/
}else{
namelist += laname+"|";
return aname;
}
//}else{
// System.out.println(namestring+" is not at [species rank]");
//}
}
}
/*String fgsname = name.replaceFirst("\\bfam\\b", "").trim();
if (text.toLowerCase().replaceFirst(
"^" + number + "\\s*\\.\\s*", "").startsWith(
fgsname.toLowerCase())) {
check = true;
ti.setName(i - 1, fgsname);
}*/
}
}catch(Exception e){
LOGGER.error("VolumeVerifier : extractName - Exception ", e);
e.printStackTrace();
}
return null;
}
/**
*
* @param text: 14c. Mirabilis linearis (Pursh) Heimerl var. subhispida (Heimerl) Spellenberg, Novon 12: 270. 2002
* @return: 14c. Mirabilis linearis (Pursh) Heimerl var. subhispida (Heimerl) Spellenberg
*/
private String chunkPlaceOfPub(String text, String filename) {
//search for the comma before a number
//Pattern p = Pattern.compile("(.*?[A-Z].*?),([^,]+?)\\d.*");
String textcp = text;
String journal = null;
Pattern p = Pattern.compile("(.*?[A-Z].*),\\s+([^\\d]+)\\s+\\d.*"); //(Rydberg) Munz, Man. S. Calif. Bot., 598. 1935
Matcher m = null;
//problems 489.xml 6a. Dysphania R. Brown sect. Adenois (Moquin-Tandon) Mosyakin & Clemants, Ukrayins�k. Bot. Zhurn., n. s. 59: 382. 2002
while(text.matches(".*?[A-Z].*?\\d+.*")){//4a. Echinocereus pectinatus (Scheidweiler) Engelmann var. wenigeri L. D. Benson, Cact. Succ. J. (Los Angeles) 40: 124, <=================>fig. 3. 1968 � Weniger�shedgehog,ashy white ra inbow cactus, langtry rainbow cactus
m = p.matcher(text);
if(m.matches()){
text = m.group(1).trim();
journal = m.group(2).replaceFirst(",\\s*$", "").trim();
}else{
break;
}
}
//post process to deal with some special cases:
//Boerhavia line arifolia A. Gray, Amer. J. Sci. Arts
int in = text.indexOf(","); //suspecious
if(in > 0){
p = Pattern.compile(",[^\\d]+&"); //may be authority list: a, b & c
m = p.matcher(text);
if(!m.find()){
p = Pattern.compile("\\([^()]*?,");//may be in ()
m = p.matcher(text);
if(!m.find()){// deal with this case
text = text.substring(0, in).trim();
}
}
}
//in Smith ed.
in = text.indexOf(" in ");
if(in > 0){
text = text.substring(0, in).trim();
//String rest = textcp.substring(in).trim();
}
//PHYTOLACCACEAE R. Brown � Pokeweed Family
in = text.indexOf("�");
if(in < 0){
in = text.indexOf("�");
}
if(in > 0){
text = text.substring(0, in).trim();
//String rest = textcp.substring(in).trim();
}
if(!isJournal(journal)){
listener.info("", filename, "Check taxon name:"+text);
}
return text;//text now holds only the name, which should not contain a number
}
/**
* check journal list
* @param journal
* @return
*/
private boolean isJournal(String journal) {
/*Connection conn = null;
try{
if(conn == null){
Class.forName("com.mysql.jdbc.Driver");
String URL = "jdbc:mysql://localhost/journals?user=root&password=root";
//String URL = ApplicationUtilities.getProperty("database.url");
conn = DriverManager.getConnection(URL);
Statement stmt = conn.createStatement();
ResultSet rs = stmt.executeQuery("select * from journals where abbreviation like'"+journal+" %'");
if(rs.next()){
return true;
}
stmt.close();
}
}catch(Exception e){
e.printStackTrace();
}*/
return false;
}
public static String fixBrokenNames(String text){
Pattern p = Pattern.compile("(.*?(?:^| ))([A-Z] )(\\w.*)");
Matcher m = p.matcher(text);
if(m.matches()){
text = m.group(1)+m.group(2).trim()+m.group(3);
}
//T HYRSOSTACHYS
p = Pattern.compile("(.*?(?:^| ))([A-Z]+ )([A-Z][A-Z].*)");
m = p.matcher(text);
if(m.matches()){
text = m.group(1)+m.group(2).trim()+m.group(3);
}
p = Pattern.compile("(.*?(?:^| ))(\\d+ )(\\d+.*)");//HOng 08/04/09 "3 9 . xxxx"
m = p.matcher(text);
if(m.matches()){
text = m.group(1)+m.group(2).trim()+m.group(3);
}
//va r.
//make sure any of these are not broken: subfam|var|subgen|subg|subsp|ser|tribe
p = Pattern.compile("(.*?)\\b(s ?u ?b ?f ?a ?m|v ?a ?r|s ?u ?b ?g ?e ?n|s ?u ?b ?g|s ?u ?b ?s ?p|s ?e ?r|t ?r ?i ?b ?e)\\b(.*)");
m = p.matcher(text);
if(m.matches()){
text = m.group(1)+m.group(2).replaceAll("\\s+", "")+m.group(3);
}
return text;
}
private int nameLine(List<Element> textList2) {
// TODO Auto-generated method stub
int size = textList2.size();
for(int i = 0; i < size; i++){
String l = textList2.get(i).getText().trim();
//exception: <text>1. Artemisia aleutica</text>
if(l.length() > 4){return i;}
if(!l.matches("^\\d.*") && !l.matches("\\W+")){ //the name line does not start with a number, nor does it contain only non=word characters
return i;
}
}
return 0;
}
/**
* 42000077#F=43#Caryophyllaceae[fam] #
50260245#SF=43a#Polycarpoideae[subfam] #
40035931#G=1#Drymaria[genus]#
06301172#1#Drymaria cordata[species] #F3 I NM W
50266761#1a#Drymaria cordata var cordata[variety]#F3 I W
*
*
* populate taxon index with names
* then concate all text in Name paragraph in one <text> element.
*/
private void fillInNames(){
try {
for (int i = 1; i <= total; i++) {
File file = new File(path, i + ".xml");
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file);
Element root = doc.getRootElement();
Element pe = (Element) XPath.selectSingleNode(root,
"/treatment/paragraph"); //get the first paragraph, which is the name paragraph
String taxonname = extractName(pe, i + ".xml");
ti.addName(taxonname);//add one by one in sequence
System.out.println("add name :"+taxonname+ " for file "+i+".xml");
}
} catch (Exception e) {
LOGGER.error("VolumeVerifier : fillInNames - Exception ", e);
e.printStackTrace();
throw new ParsingException(e);
}
}
}