package fna.parsing; import java.io.BufferedOutputStream; import java.io.FileOutputStream; import java.io.IOException; import java.util.Iterator; import java.util.List; import org.apache.log4j.Logger; import org.jdom.Attribute; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; import org.jdom.xpath.XPath; /** * Hong 08/04/09 revised for FoC volumes a) add start, names, * tribegenusnamestyle private properties. b) if(treatment.indexOf(new * Element("text"))>=0){ =>added this condition to filter out empty files. Hong * 10/7/08: a) record "smallcaps" for genus/tribe names this is necessary when a * taxonlist is not provided with "smallcaps" info in the extracted records, * VolumeVerifier can build a taxon index for VolumeTransformer. b) also keep * the original delimiters in names: may be useful for VolumeVerifier. * * Chunshui summer 08: To extract the data from the docx file. * * The functions include: 1, (TODO)extract the document.xml from the docx file. * 2, parse the document.xml 3, output individual treatment in an intermediate * xml file. * * Only the paragraphs enclosed in the style listed in style-mapping.properties * file will be kept. * * And save the data to an XML format listing style and text pair for each * paragraph. * * The output will be processed further by VolumeVerifier.java * * @author chunshui */ @SuppressWarnings({ "unchecked" }) public class VolumeExtractor extends Thread { protected String source; // private MainForm mainForm; protected static final Logger LOGGER = Logger .getLogger(VolumeExtractor.class); protected String target; protected ProcessListener listener; protected int count; protected Element treatment; protected XMLOutputter outputter; // private String start = "Name"; //TODO: include the following in the // configuration file: style names indicating the start of a new treatment // private String syn = "Syn"; // private String tribegennamestyle = "smallCaps"; protected static String start = ".*?(Heading|Name).*"; // starts a treatment // public static String start = ""; //starts a treatment protected String names = ".*?(Syn|Name).*"; // other interesting names worth parsing protected String key = ".*?-Key.*"; public String tribegennamestyle = "caps"; protected static String ignorednames = "incertae sedis"; private boolean debug = false; private boolean keydebug = true; public VolumeExtractor(String source, String target, ProcessListener listener) { this.source = source; this.target = target; this.listener = listener; Registry.TribeGenusNameCase = tribegennamestyle; Registry.NomenclatureStylePtn = start; Registry.SynonymStylePtn = names; } /** * Extract the data from the source file * * TODO: unzip the document.xml from the docx file */ public void run() { listener.setProgressBarVisible(true); extract(); listener.setProgressBarVisible(false); } public void extract() throws ParsingException { try { listener.progress(1); // init the outputter outputter = new XMLOutputter(Format.getPrettyFormat()); // build the root element from the xml file SAXBuilder builder = new SAXBuilder(); Document doc = builder.build(source + "document.xml"); if(debug) System.out.println(source + "document.xml"); Element root = doc.getRootElement(); // find all <w:p> tags List<Element> wpList = XPath.selectNodes(root, "/w:document/w:body/w:p"); // iterate over the <w:p> tags count = 1; int total = wpList.size(); for (Iterator<Element> iter = wpList.iterator(); iter.hasNext();) { // Element test = (Element)iter.next(); // System.out.println(test.getName());//new added processParagraph((Element) iter.next()); listener.progress((count * 100) / total); // output(); } // output the last file output(); } catch (Exception e) { LOGGER.error( "Unable to parse/ extract the file in VolumeExtractor:extract", e); e.printStackTrace(); throw new ParsingException(e); } } /** * To process a w:p tag * * output style:text pairs for each paragraph * * @param wp * @throws JDOMException */ private void processParagraph(Element wp) throws Exception { // read the paragraph style Attribute att = (Attribute) XPath.selectSingleNode(wp, "./w:pPr/w:pStyle/@w:val");// XXX change from @w:val to w:val if (att == null) {// TODO: issue a warning if(debug) System.out.println("============================================>null"); return; } String style = att.getValue(); if(debug) System.out.println(style); // check if a name paragraph reached, assuming a treatment starts with a // Name paragraph // if (style.indexOf("Name") >= 0) { if (style.matches(start)) {// start = ".*?(Heading|Name).*" // The code reaches to a name paragraph // output the current treatment file // if (treatment != null) { if (treatment != null) { if (treatment.getChild("paragraph") != null) { if (treatment.getChild("paragraph").getChild("text") != null && !treatment.getChild("paragraph") .getChild("text").getTextTrim() .matches(".*?" + ignorednames + ".*") && treatment.getChildren("paragraph").size() >= 2) { // must contain style and text, must contain >=2 paragraphs /* * It is not possible for a treatment to just have a * name Heading4 /Taxa incertae sedis from FoC v22, taxa * whose placement is uncertain */ output(); // ready to write this treatment out count++; } } else { output(); // ready to write this treatment out count++; } } // logger.info("processing: " + count); // create a new output file treatment = new Element("treatment"); } populateTreatment(wp, style); } /*protected void createTreatment() { treatment = new Element("treatment"); }*/ protected void populateTreatment(Element wp, String style) throws JDOMException { Element se = new Element("style"); se.setText(style); Element pe = new Element("paragraph"); pe.addContent(se); if (style.matches(start) || style.matches(names)) { extractNameParagraph(wp, pe); }else if(style.matches(key)){ extractKeyParagraph(wp, pe); //try to separate a key "statement" from "determination" }else { extractTextParagraph(wp, pe); } // add the element to the treatment (root) element treatment.addContent(pe); } /** * wp containing the text, to be formated as "statement # determination", then add to pe * @param wp * @param pe */ private void extractKeyParagraph(Element wp, Element pe) throws JDOMException{ StringBuffer formatted = new StringBuffer(); List<Element> text = XPath.selectNodes(wp, "./w:r/w:tab"); Iterator<Element> it = text.iterator(); while(it.hasNext()){ Element t = it.next(); t.setText("###"); t.setName("t"); } text = XPath.selectNodes(wp, "./w:r/w:t"); it = text.iterator(); while(it.hasNext()){ Element t = it.next(); formatted.append(t.getTextTrim()+" "); } /* List<Element> text = XPath.selectNodes(wp, "./w:r/w:t"); Iterator<Element> it = text.iterator(); while(it.hasNext()){ Element t = it.next(); if(t.getAttribute("space", Namespace.XML_NAMESPACE) != null && t.getAttributeValue("space", Namespace.XML_NAMESPACE).compareTo("preserve")==0){ String temp = t.getTextTrim(); if(temp.length()>0) formatted.append(" ### ").append(temp+" "); }else{ formatted.append(t.getTextTrim()+" "); } } */ Element te = new Element("text"); String t = formatted.toString().trim(); te.setText(t); pe.addContent(te); if(keydebug) System.out.println(t); } private void extractNameParagraph(Element wp, Element pe) throws JDOMException { String acase = ""; List<Element> rList = XPath.selectNodes(wp, "./w:r"); for (Iterator <Element>ti = rList.iterator(); ti.hasNext();) { Element re = (Element) ti.next(); // find smallCaps Element rpr = (Element) XPath.selectSingleNode(re, "./w:rPr"); // Genus, // Tribe // names // are // in // smallCaps if (rpr != null && XPath.selectSingleNode(rpr, "./w:" + tribegennamestyle) != null) { acase = tribegennamestyle; } else { acase = ""; } // collect text StringBuffer buffer = new StringBuffer(); List<Element> textList = XPath.selectNodes(re, "./w:t"); for (Iterator<Element> it = textList.iterator(); it.hasNext();) { Element wt = (Element) it.next(); String tmp = wt.getText(); buffer.append(tmp).append(" "); } // } String text = buffer.toString().replaceAll("\\s+", " ").trim(); ; // build the elements Element te = null; if (text.matches(".*?\\S.*")) { // not an empty string or a // number of spaces te = new Element("text"); te.setText(text); } if(debug) System.out.println("Name: " + acase + " : " + text); Attribute ca = null; if (!acase.equals("") && te != null) { ca = new Attribute("case", tribegennamestyle); te.setAttribute(ca); } if (te != null) pe.addContent(te); } } private void extractTextParagraph(Element wp, Element pe) throws JDOMException { StringBuffer buffer = new StringBuffer(); List<Element> textList = XPath.selectNodes(wp, "./w:r/w:t"); for (Iterator<Element> ti = textList.iterator(); ti.hasNext();) { Element wt = (Element) ti.next(); buffer.append(wt.getText()).append("#"); } String text = buffer.toString().replaceAll("-#", "-") .replaceAll("#", "").replaceAll("\\s+", " ").trim(); /* * buffer.append(wt.getText()).append("-"); } String text = * buffer.toString().replaceAll("\\s+", " ").trim(); */ Element te = new Element("text"); te.setText(text); pe.addContent(te); } /** * To output the <treatment> element * * @throws IOException */ private void output() throws ParsingException { try { String file = target + "extracted\\" + count + ".xml"; Document doc = new Document(treatment); BufferedOutputStream out = new BufferedOutputStream( new FileOutputStream(file)); /* Producer */ outputter.output(doc, out); /* Consumer */ listener.info(count + "", file); } catch (IOException e) { LOGGER.error("Exception in VolumeExtractor : output", e); throw new ParsingException(e); } } public static String getStart() { return start; } public static void setStart(String start) { VolumeExtractor.start = start; } }