VolumeExtractor.java example

Explorer
phenoscape-nlp-master
- parsing-gui
  - lib
    - elk-distribution-0.3.2-owlapi-library
      - examples
        org
        semanticweb
        elk
        owlapi
        examples
        QueryingUnnamedClassExpressions.java
        QueryingWithNamedClasses.java
        RetrievingInstances.java
        SavingInferredAxioms.java
  - src
    - com
      - swtdesigner
        SWTResourceManager.java
    - fna
- phenoscapeII
  - src
package fna.parsing;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.log4j.Logger;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;

/**
 * Splits the <code>document.xml</code> of a docx volume into individual
 * treatments and writes each treatment to an intermediate XML file that lists
 * a style/text pair for every paragraph.
 * 
 * The functions include: 1, (TODO) extract the document.xml from the docx
 * file. 2, parse the document.xml. 3, output each individual treatment in an
 * intermediate xml file.
 * 
 * According to the original author's notes, only the paragraphs enclosed in a
 * style listed in the style-mapping.properties file will be kept; the code in
 * this class itself only skips paragraphs that carry no paragraph style and
 * paragraphs that appear before the first treatment.
 * 
 * The output will be processed further by VolumeVerifier.java
 * 
 * Revision history:
 * 
 * Hong 08/04/09 revised for FoC volumes: a) add start, names,
 * tribegenusnamestyle private properties. b) added a condition to filter out
 * empty files (treatments without a text element).
 * 
 * Hong 10/7/08: a) record "smallcaps" for genus/tribe names. This is necessary
 * when a taxonlist is not provided: with "smallcaps" info in the extracted
 * records, VolumeVerifier can build a taxon index for VolumeTransformer. b)
 * also keep the original delimiters in names: may be useful for
 * VolumeVerifier.
 * 
 * Chunshui summer 08: To extract the data from the docx file.
 * 
 * @author chunshui
 */

@SuppressWarnings({ "unchecked" })
public class VolumeExtractor extends Thread {

	/** Path prefix of the input; "document.xml" is appended to it. */
	protected String source;
	// private MainForm mainForm;
	protected static final Logger LOGGER = Logger
			.getLogger(VolumeExtractor.class);

	/** Path prefix of the output; "extracted\" + number + ".xml" is appended. */
	protected String target;

	/** Receives progress updates and a notification for every file written. */
	protected ProcessListener listener;

	/** Number used in the file name of the next treatment to be written. */
	protected int count;

	/** The treatment currently being assembled; null until the first start paragraph. */
	protected Element treatment;

	protected XMLOutputter outputter;

	// TODO: include the following in the configuration file: style names
	// indicating the start of a new treatment
	/** Regex on the paragraph style name: a match starts a new treatment. */
	protected static String start = ".*?(Heading|Name).*"; // starts a treatment
	/** Regex on the paragraph style name: other interesting names worth parsing. */
	protected String names = ".*?(Syn|Name).*"; // other interesting names worth parsing
	/** Regex on the paragraph style name: paragraphs that belong to a key. */
	protected String key = ".*?-Key.*";
	/** Local name of the w:rPr child element that marks genus/tribe names. */
	public String tribegennamestyle = "caps";
	/** A pending treatment whose first text matches this is not written out. */
	protected static String ignorednames = "incertae sedis";
	private boolean debug = false;
	// NOTE(review): enabled, so every key paragraph is echoed to System.out
	private boolean keydebug = true;

	/**
	 * Creates an extractor and publishes its style settings to the Registry.
	 * 
	 * @param source
	 *            path prefix under which document.xml is read
	 * @param target
	 *            path prefix under which the extracted files are written
	 * @param listener
	 *            receiver of progress and per-file notifications
	 */
	public VolumeExtractor(String source, String target,
			ProcessListener listener) {
		this.source = source;
		this.target = target;
		this.listener = listener;
		Registry.TribeGenusNameCase = tribegennamestyle;
		Registry.NomenclatureStylePtn = start;
		Registry.SynonymStylePtn = names;
	}

	/**
	 * Runs the extraction with the progress bar shown. The progress bar is
	 * hidden again even when the extraction fails.
	 * 
	 * TODO: unzip the document.xml from the docx file
	 */
	@Override
	public void run() {
		listener.setProgressBarVisible(true);
		try {
			extract();
		} finally {
			listener.setProgressBarVisible(false);
		}
	}

	/**
	 * Parses source + "document.xml", walks over all /w:document/w:body/w:p
	 * paragraphs and writes one XML file per treatment.
	 * 
	 * @throws ParsingException
	 *             wrapping any exception raised while parsing or writing
	 */
	public void extract() throws ParsingException {
		try {
			listener.progress(1);
			// init the outputter
			outputter = new XMLOutputter(Format.getPrettyFormat());

			// build the root element from the xml file
			// NOTE(review): the parser is used with its default settings; if
			// the input can be untrusted, consider disabling DTDs/external
			// entities
			SAXBuilder builder = new SAXBuilder();
			Document doc = builder.build(source + "document.xml");
			if(debug) System.out.println(source + "document.xml");
			Element root = doc.getRootElement();

			// find all <w:p> tags
			List<Element> wpList = XPath.selectNodes(root, "/w:document/w:body/w:p");

			count = 1;
			int total = wpList.size();
			for (Element wp : wpList) {
				processParagraph(wp);
				// count is the treatment number, not the paragraph index, so
				// this is only a coarse progress indication
				listener.progress((count * 100) / total);
			}

			// output the last file
			output();
		} catch (Exception e) {
			// the logger records the stack trace and the exception is
			// rethrown with its cause, so no printStackTrace() is needed
			LOGGER.error(
					"Unable to parse/ extract the file in VolumeExtractor:extract",
					e);
			throw new ParsingException(e);
		}
	}

	/**
	 * To process a w:p tag: output style:text pairs for each paragraph.
	 * 
	 * A paragraph whose style matches {@link #start} closes the pending
	 * treatment (writing it out if it qualifies) and opens a new one.
	 * Paragraphs without a paragraph style are skipped.
	 * 
	 * @param wp
	 *            the w:p element
	 * @throws JDOMException
	 *             if an XPath query fails
	 * @throws ParsingException
	 *             if the pending treatment cannot be written
	 */
	private void processParagraph(Element wp) throws JDOMException,
			ParsingException {
		// read the paragraph style
		Attribute att = (Attribute) XPath.selectSingleNode(wp,
				"./w:pPr/w:pStyle/@w:val");
		if (att == null) {// TODO: issue a warning
			if(debug) System.out.println("============================================>null");
			return;
		}
		String style = att.getValue();
		if(debug) System.out.println(style);

		// a treatment is assumed to start with a Heading/Name paragraph
		if (style.matches(start)) {
			if (treatment != null && isWorthWriting(treatment)) {
				output(); // ready to write this treatment out
				count++;
			}
			// start collecting the next treatment
			treatment = new Element("treatment");
		}
		populateTreatment(wp, style);
	}

	/**
	 * Decides whether a finished treatment is written to a file.
	 * 
	 * A treatment without any paragraph is written. Otherwise its first
	 * paragraph must have a text element that does not mention
	 * {@link #ignorednames}, and the treatment must contain at least two
	 * paragraphs: it is not possible for a treatment to just have a name
	 * (e.g. Heading4 / "Taxa incertae sedis" from FoC v22, taxa whose
	 * placement is uncertain).
	 */
	private boolean isWorthWriting(Element candidate) {
		Element firstParagraph = candidate.getChild("paragraph");
		if (firstParagraph == null) {
			return true;
		}
		Element firstText = firstParagraph.getChild("text");
		return firstText != null
				&& !firstText.getTextTrim().matches(
						".*?" + ignorednames + ".*")
				&& candidate.getChildren("paragraph").size() >= 2;
	}

	/**
	 * Builds a paragraph element (style plus text) for wp and appends it to
	 * the current treatment. The text is extracted according to the kind of
	 * style: name, key, or plain text.
	 * 
	 * Paragraphs that appear before the first treatment has been started are
	 * skipped with a warning; previously they caused a NullPointerException
	 * that aborted the whole extraction.
	 * 
	 * @param wp
	 *            the w:p element
	 * @param style
	 *            the paragraph style name of wp
	 * @throws JDOMException
	 *             if an XPath query fails
	 */
	protected void populateTreatment(Element wp, String style)
			throws JDOMException {
		if (treatment == null) {
			LOGGER.warn("Skipping paragraph with style '" + style
					+ "' found before the first treatment");
			return;
		}

		Element se = new Element("style");
		se.setText(style);

		Element pe = new Element("paragraph");
		pe.addContent(se);

		if (style.matches(start) || style.matches(names)) {
			extractNameParagraph(wp, pe);
		} else if (style.matches(key)) {
			// try to separate a key "statement" from "determination"
			extractKeyParagraph(wp, pe);
		} else {
			extractTextParagraph(wp, pe);
		}

		// add the element to the treatment (root) element
		treatment.addContent(pe);
	}

	/**
	 * wp containing the text, to be formated as "statement ### determination",
	 * then add to pe.
	 * 
	 * Note that this modifies wp: every w:r/w:tab element is renamed to "t"
	 * and given the text "###".
	 * 
	 * @param wp
	 *            the w:p element of a key paragraph
	 * @param pe
	 *            the paragraph element receiving the text element
	 * @throws JDOMException
	 *             if an XPath query fails
	 */
	private void extractKeyParagraph(Element wp, Element pe) throws JDOMException{
		// turn tabs into text elements so that the w:t query below is meant
		// to return them, in place, as "###" markers
		List<Element> tabs = XPath.selectNodes(wp, "./w:r/w:tab");
		for (Element tab : tabs) {
			tab.setText("###");
			tab.setName("t");
		}

		StringBuilder formatted = new StringBuilder();
		List<Element> pieces = XPath.selectNodes(wp, "./w:r/w:t");
		for (Element piece : pieces) {
			formatted.append(piece.getTextTrim()+" ");
		}

		Element te = new Element("text");
		String t = formatted.toString().trim();
		te.setText(t);
		pe.addContent(te);
		if(keydebug) System.out.println(t);
	}

	/**
	 * Adds one text element per non-empty w:r run of wp to pe. A run whose
	 * w:rPr has a w:&lt;tribegennamestyle&gt; child (Genus, Tribe names) gets a
	 * "case" attribute on its text element.
	 * 
	 * @param wp
	 *            the w:p element of a name paragraph
	 * @param pe
	 *            the paragraph element receiving the text elements
	 * @throws JDOMException
	 *             if an XPath query fails
	 */
	private void extractNameParagraph(Element wp, Element pe)
			throws JDOMException {
		List<Element> rList = XPath.selectNodes(wp, "./w:r");

		for (Element re : rList) {
			// find the run property that marks Genus/Tribe names
			Element rpr = (Element) XPath.selectSingleNode(re, "./w:rPr");
			String acase = "";
			if (rpr != null
					&& XPath.selectSingleNode(rpr, "./w:"
							+ tribegennamestyle) != null) {
				acase = tribegennamestyle;
			}

			// collect text
			StringBuilder buffer = new StringBuilder();
			List<Element> textList = XPath.selectNodes(re, "./w:t");
			for (Element wt : textList) {
				buffer.append(wt.getText()).append(" ");
			}
			String text = buffer.toString().replaceAll("\\s+", " ").trim();
			if(debug) System.out.println("Name: " + acase + " : " + text);

			// skip runs that are empty or consist of spaces only
			if (!text.matches(".*?\\S.*")) {
				continue;
			}

			// build the element
			Element te = new Element("text");
			te.setText(text);
			if (!acase.equals("")) {
				te.setAttribute(new Attribute("case", tribegennamestyle));
			}
			pe.addContent(te);
		}
	}

	/**
	 * Concatenates the text of all w:r/w:t elements of wp, collapses
	 * whitespace and adds the result to pe as a single text element.
	 * 
	 * "#" is used as a temporary separator between the pieces and removed
	 * afterwards, so any "#" characters in the source text are removed as
	 * well.
	 * 
	 * @param wp
	 *            the w:p element of a plain text paragraph
	 * @param pe
	 *            the paragraph element receiving the text element
	 * @throws JDOMException
	 *             if the XPath query fails
	 */
	private void extractTextParagraph(Element wp, Element pe)
			throws JDOMException {
		StringBuilder buffer = new StringBuilder();

		List<Element> textList = XPath.selectNodes(wp, "./w:r/w:t");
		for (Element wt : textList) {
			buffer.append(wt.getText()).append("#");
		}
		String text = buffer.toString().replaceAll("-#", "-")
				.replaceAll("#", "").replaceAll("\\s+", " ").trim();

		Element te = new Element("text");
		te.setText(text);
		pe.addContent(te);
	}

	/**
	 * To output the current treatment element to
	 * target + "extracted\" + count + ".xml" and report the file to the
	 * listener.
	 * 
	 * The stream is always closed, and the listener is only notified after
	 * the file has been closed.
	 * 
	 * @throws ParsingException
	 *             wrapping the IOException if the file cannot be written
	 */
	private void output() throws ParsingException {
		try {
			// NOTE(review): the path uses a Windows-style separator
			String file = target + "extracted\\" + count + ".xml";
			Document doc = new Document(treatment);
			BufferedOutputStream out = new BufferedOutputStream(
					new FileOutputStream(file));
			try {
				/* Producer */
				outputter.output(doc, out);
			} finally {
				// close() also flushes whatever is still buffered
				out.close();
			}

			/* Consumer */
			listener.info(count + "", file);

		} catch (IOException e) {
			LOGGER.error("Exception in VolumeExtractor : output", e);
			throw new ParsingException(e);
		}
	}

	/**
	 * @return the regex on style names that marks the start of a treatment
	 */
	public static String getStart() {
		return start;
	}

	/**
	 * @param start
	 *            the regex on style names that marks the start of a treatment
	 */
	public static void setStart(String start) {
		VolumeExtractor.start = start;
	}
}