/**
*
*/
package fna.parsing;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
import org.apache.log4j.Logger;
import org.eclipse.swt.widgets.Display;
import org.eclipse.swt.widgets.Text;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
/**
 * Transforms NeXML characters/states into a format suitable for CharaParser.
 * Expects one NeXML file per PDF paper (original publication) in the source folder.
 *
 * <p>This base class iterates over the files of the source folder, hands each one
 * to {@link #outputTo(File, File, File, File)} and reports progress to the
 * {@link ProcessListener}. Subclasses implement {@link #setXPaths()} and
 * {@link #outputTo(File, File, File, File)}.
 *
 * @author hongcui
 */
public abstract class CharacterStatementsTransformer extends Thread {
    /** Seed terms supplied by the caller; stored for subclasses, not read in this class. */
    protected ArrayList<String> seeds = new ArrayList<String>();
    /** Folder whose files are transformed (taken from {@code Registry.SourceDirectory}). */
    protected File source = new File(Registry.SourceDirectory);
    /** Folder that receives the output sub-folders (taken from {@code Registry.TargetDirectory}). */
    protected File target = new File(Registry.TargetDirectory);
    protected static final Logger LOGGER = Logger.getLogger(CharacterStatementsTransformer.class);
    /** Name used for the seeds file; not read in this class. */
    protected String seedfilename = "seeds";
    /** Receives progress and per-file notifications while {@link #run()} executes. */
    protected ProcessListener listener;
    /** Text widget supplied by the caller; stored for subclasses, not used in this class. */
    protected Text perlLog;
    /** Pretty-printing XML outputter made available to subclasses. */
    protected XMLOutputter outputter;
    /** Marks multi-word phrases in text before it is written by {@link #write2file}. */
    protected PhraseMarker pm;

    /**
     * Stores the collaborators, lets the subclass configure itself through
     * {@link #setXPaths()} and resets the output sub-folders of the target directory.
     *
     * @param listener receiver of progress/info callbacks
     * @param display  not used by this constructor
     * @param perllog  text widget kept in {@link #perlLog}
     * @param seeds    seed terms kept in {@link #seeds}
     */
    CharacterStatementsTransformer(ProcessListener listener, Display display,
            Text perllog, ArrayList<String> seeds) {
        this.seeds = seeds;
        this.listener = listener;
        this.perlLog = perllog;
        pm = new PhraseMarker();
        this.outputter = new XMLOutputter(Format.getPrettyFormat());
        // NOTE(review): overridable method invoked from the constructor; it runs
        // before any subclass field initializers, so implementations must not
        // depend on subclass state.
        setXPaths();
        // The 'target' field already points at Registry.TargetDirectory, so the
        // former shadowing local variable was redundant.
        Utilities.resetFolder(target, "descriptions");
        Utilities.resetFolder(target, "transformed");
        Utilities.resetFolder(target, "markedup");
        Utilities.resetFolder(target, "final");
        Utilities.resetFolder(target, "co-occurrence");
    }

    /** Subclass hook called once from the constructor. */
    protected abstract void setXPaths();

    /**
     * Creates the output folders and processes every file of the source folder.
     * <ul>
     * <li>descriptions: for state label attributes</li>
     * <li>characters: for char elements</li>
     * <li>transformed: for entire NeXML documents</li>
     * </ul>
     */
    private void output2Target() {
        File des = createFolderIn(target, "descriptions");
        File tra = createFolderIn(target, "transformed");
        File cha = createFolderIn(target, "characters");
        File[] files = this.source.listFiles();
        listener.progress(30);
        if (files == null) {
            // listFiles() yields null when the path is not a directory or cannot be read.
            LOGGER.error("Cannot list files of source folder " + source.getAbsolutePath());
            files = new File[0];
        }
        for (int i = 0; i < files.length; i++) {
            String fname = files[i].getName();
            outputTo(des, cha, tra, files[i]);
            // Report the file under its base name with an .xml extension.
            listener.info((i + 1) + "", fname.replaceAll("\\..*$", "") + ".xml");
            // NOTE(review): per-file progress scales towards 90 while the final
            // call below reports 60 - confirm the intended progress range.
            listener.progress((90 * i) / files.length);
        }
        listener.progress(60);
    }

    /**
     * Processes one source file.
     *
     * @param desfolder the "descriptions" output folder
     * @param chafolder the "characters" output folder
     * @param trafolder the "transformed" output folder
     * @param file      the source file to process
     */
    protected abstract void outputTo(File desfolder, File chafolder, File trafolder, File file);

    /**
     * Writes {@code text}, after phrase marking, to the file {@code fname} inside
     * {@code desfolder}. I/O failures are logged, not propagated.
     *
     * @param desfolder folder that receives the file
     * @param fname     name of the file to (over)write
     * @param text      content to mark up and write
     */
    protected void write2file(File desfolder, String fname, String text) {
        BufferedWriter out = null;
        try {
            out = new BufferedWriter(new FileWriter(new File(desfolder, fname)));
            text = pm.markPhrases(text); //phrases are connected via "_" and become words.
            out.write(text);
            out.flush();
        } catch (IOException e) {
            LOGGER.error("Exception in CharacterStatementsTransformer.write2file", e);
        } finally {
            // Close in finally so the file handle is released even when phrase
            // marking or writing fails.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    LOGGER.error("Exception closing file in CharacterStatementsTransformer.write2file", e);
                }
            }
        }
    }

    /**
     * Returns the folder {@code foldername} inside {@code parent}, creating it if
     * possible. When the folder cannot be created directly, whatever currently
     * occupies that path is renamed by appending the current time in milliseconds
     * and creation is attempted once more.
     *
     * @param parent     directory that should contain the folder
     * @param foldername name of the folder
     * @return the {@code File} for the folder, whether or not creation succeeded
     */
    private File createFolderIn(File parent, String foldername) {
        File nfile = new File(parent, foldername);
        if (nfile.mkdir()) {
            return nfile;
        }
        File backup = new File(parent, nfile.getName() + "" + System.currentTimeMillis());
        if (!nfile.renameTo(backup)) {
            LOGGER.warn("Could not rename " + nfile + " to " + backup);
        }
        if (!nfile.mkdir() && !nfile.isDirectory()) {
            LOGGER.error("Could not create folder " + nfile);
        }
        return nfile;
    }

    /**
     * Thread entry point: shows the progress bar, runs the transformation and
     * hides the progress bar again, even if the transformation fails.
     */
    @Override
    public void run() {
        listener.setProgressBarVisible(true);
        try {
            LOGGER.debug("compiled");
            output2Target();
        } finally {
            listener.setProgressBarVisible(false);
        }
    }
}