/**
 *
 */
package fna.parsing;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Hashtable;

import org.apache.log4j.Logger;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.input.SAXBuilder;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;
import org.jdom.Content;
import org.jdom.Text;

import fna.db.*;

/**
 * Splits taxonX/other XML documents into smaller units; each resulting XML
 * document contains one treatment.
 *
 * <p>Subclasses supply the format-specific work in {@link #transformXML(File[])}.
 * This base class prepares the output folders, records the mapping from the
 * original file names to sequential names ({@code 1.xml}, {@code 2.xml}, ...)
 * and offers helpers for tagging and exporting description paragraphs.
 *
 * @author hongcui
 */
@SuppressWarnings({ "unchecked", "unused" })
public abstract class Type4Transformer extends Thread {

    /** Folder of XML documents to be annotated. */
    private File source = new File(Registry.SourceDirectory);

    /** Root folder that receives all output sub-folders. */
    File target = new File(Registry.TargetDirectory);

    private XMLOutputter outputter = null;

    /** The data prefix from the general tab. */
    private String dataprefix = null;

    protected ProcessListener listener;

    protected static final Logger LOGGER = Logger.getLogger(Type4Transformer.class);

    /**
     * Creates the transformer and resets the output sub-folders below the
     * target directory.
     *
     * @param listener   receives progress notifications
     * @param dataprefix prefix handed to the file-name-mapping database accessor
     */
    public Type4Transformer(ProcessListener listener, String dataprefix) {
        this.listener = listener;
        this.dataprefix = dataprefix;

        if (!target.exists()) {
            // mkdirs() also creates missing parent folders, which mkdir() would not.
            target.mkdirs();
        }

        Utilities.resetFolder(target, "descriptions");
        Utilities.resetFolder(target, "transformed");
        Utilities.resetFolder(target, "descriptions-dehyphened");
        Utilities.resetFolder(target, "markedup");
        Utilities.resetFolder(target, "final");
        Utilities.resetFolder(target, "co-occurrence");
    }

    /**
     * Thread entry point: shows the progress bar, runs {@link #transform()},
     * and hides the progress bar again even when the transformation fails.
     */
    @Override
    public void run() {
        listener.setProgressBarVisible(true);
        try {
            transform();
        } finally {
            listener.setProgressBarVisible(false);
        }
    }

    /**
     * Builds the "original name -> sequential name" mapping for every file in
     * the source folder, stores it through {@code Type4TransformerDbAccessor},
     * and then delegates to {@link #transformXML(File[])}.
     *
     * <p>NOTE(review): the numbering follows the order returned by
     * {@link File#listFiles()}, which the JDK does not guarantee to be sorted.
     *
     * @throws IllegalStateException if the source folder cannot be listed
     */
    public void transform() {
        File[] files = source.listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            String message = "Type4Transformer : cannot list source folder " + source;
            LOGGER.error(message);
            throw new IllegalStateException(message);
        }

        // create renaming mapping table
        Hashtable<String, String> filemapping = new Hashtable<String, String>();
        listener.progress(1);
        for (int f = 0; f < files.length; f++) {
            listener.progress((100 * (f + 1)) / files.length);
            String originalName = files[f].getName();
            String sequentialName = (f + 1) + ".xml";
            System.out.println(originalName + " to " + sequentialName);
            filemapping.put(originalName, sequentialName);
        }

        Type4TransformerDbAccessor t4tdb = new Type4TransformerDbAccessor("filenamemapping", dataprefix);
        t4tdb.addRecords(filemapping);

        // transform XML
        transformXML(files);
    }

    /**
     * Performs the format-specific transformation of the source files.
     *
     * @param files the files found in the source folder
     */
    protected abstract void transformXML(File[] files);

    /**
     * Renames the description node(s) of a treatment to {@code description},
     * removes their namespace and tags each with a {@code pid} attribute of
     * the form {@code <fn>_<count>.txtp<index>}.
     *
     * @param treatment        the treatment element to modify in place
     * @param descriptionXPath XPath selecting the description container
     * @param paraXPath        XPath (relative to the container) selecting the
     *                         paragraphs, or {@code null} to treat the container
     *                         itself as the single description
     * @param fn               sequential number of the source file
     * @param count            number of the treatment within the file
     * @return the (possibly modified) treatment; unchanged when no description
     *         container is found; {@code null} when an exception occurred
     */
    protected Element formatDescription(Element treatment, String descriptionXPath, String paraXPath, int fn, int count) {
        try {
            Element description = (Element) XPath.selectSingleNode(treatment, descriptionXPath);
            if (description == null) {
                return treatment;
            }

            if (paraXPath != null) {
                List<Element> paragraphs = XPath.selectNodes(description, paraXPath);
                int i = 0;
                for (Element p : paragraphs) {
                    p.setName("description");
                    p.setAttribute("pid", fn + "_" + count + ".txtp" + i);
                    p.setNamespace(null);
                    i++;
                }
            } else {
                // no paraXPath is given, make the description element the only one
                description.setName("description");
                description.setAttribute("pid", fn + "_" + count + ".txtp0");
                description.setNamespace(null);
            }
            return treatment;
        } catch (Exception e) {
            LOGGER.error("Type4Transformer : error.", e);
            return null;
        }
    }

    /**
     * Writes every {@code description} child of each description-typed
     * {@code tax:div} to its own text file via
     * {@link #writeDescription2Descriptions(String, String)}. The file name
     * records the position of the paragraph: {@code <fn>_<count>.txtp<index>},
     * where the index keeps counting across all matching divs.
     *
     * <p>Failures are logged and end the export for this document; they are
     * not propagated to the caller.
     *
     * @param root  element the absolute XPath is evaluated against
     * @param fn    sequential number of the source file
     * @param count number of the treatment within the file
     */
    protected void getDescriptionFrom(Element root, int fn, int count) {
        try {
            List<Element> divs = XPath.selectNodes(root, "/tax:taxonx/tax:taxonxBody/tax:treatment/tax:div");
            int i = 0;
            for (Element div : divs) {
                // Constant-first comparison: a div without a "type" attribute is
                // skipped instead of raising a NullPointerException.
                if (!"description".equalsIgnoreCase(div.getAttributeValue("type"))) {
                    continue;
                }

                List<Element> paragraphs = div.getChildren("description");
                for (Element p : paragraphs) {
                    StringBuilder sb = new StringBuilder();
                    int size = p.getContentSize();
                    for (int c = 0; c < size; c++) {
                        Content cont = p.getContent(c);
                        // Only element and text nodes contribute text; other content is ignored.
                        if (cont instanceof Element) {
                            sb.append(((Element) cont).getTextNormalize()).append(" ");
                        } else if (cont instanceof Text) {
                            sb.append(((Text) cont).getTextNormalize()).append(" ");
                        }
                    }
                    // record the position for each paragraph.
                    writeDescription2Descriptions(sb.toString(), fn + "_" + count + ".txtp" + i);
                    i++;
                }
            }
        } catch (Exception e) {
            LOGGER.error("Type4Transformer : failed to extract descriptions for " + fn + "_" + count, e);
        }
    }

    /**
     * Writes the given text to {@code <target>/descriptions/<fn>.txt}.
     *
     * <p>NOTE(review): {@link FileWriter} uses the platform default charset;
     * confirm that downstream readers expect the same encoding.
     *
     * @param textNormalize the text to write
     * @param fn            file name without the {@code .txt} extension
     * @throws ParsingException if the file cannot be written or closed
     */
    protected void writeDescription2Descriptions(String textNormalize, String fn) {
        File file = new File(target + System.getProperty("file.separator") + "descriptions", fn + ".txt");
        try {
            BufferedWriter out = new BufferedWriter(new FileWriter(file));
            try {
                out.write(textNormalize);
            } finally {
                // Always release the file handle; close() also flushes, so a
                // failure here is reported through the catch below.
                out.close();
            }
        } catch (IOException e) {
            LOGGER.error("Failed to output text file in Type4Transformer:outputDescriptionText", e);
            throw new ParsingException("Failed to output text file.", e);
        }
    }

    /**
     * Writes the treatment to {@code <target>/transformed/<fn>_<count>.xml}
     * through {@code ParsingUtil.outputXML}.
     *
     * @param root  root element of the treatment to output
     * @param fn    sequential number of the source file
     * @param count number of the treatment within the file
     */
    protected void writeTreatment2Transformed(Element root, int fn, int count) {
        File outputFile = new File(target + System.getProperty("file.separator") + "transformed", fn + "_" + count + ".xml");
        ParsingUtil.outputXML(root, outputFile, null);
    }

    /**
     * Intentionally empty: this class is abstract and cannot be instantiated
     * directly.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // nothing to run
    }
}