package edu.cmu.minorthird.ui;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Iterator;
import edu.cmu.minorthird.classify.Dataset;
import edu.cmu.minorthird.classify.DatasetLoader;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.learn.SpanFeatureExtractor;
import edu.cmu.minorthird.util.BasicCommandLineProcessor;
import edu.cmu.minorthird.util.CommandLineProcessor;
import edu.cmu.minorthird.util.JointCommandLineProcessor;
import edu.cmu.minorthird.util.gui.ViewerFrame;
/**
* Preprocess text data for classification.
*
* @author William Cohen
*/
public class PreprocessTextForClassifier extends UIMain{
// private static Logger log=Logger.getLogger(PreprocessTextForClassifier.class);
protected String linkFileName=null;
protected SpanFeatureExtractor fe=new Recommended.DocumentFE();
protected CommandLineUtil.SaveParams save=new CommandLineUtil.SaveParams();
protected CommandLineUtil.ClassificationSignalParams signal=
new CommandLineUtil.ClassificationSignalParams(base);
protected Dataset dataset;
public class LinkFileParams extends BasicCommandLineProcessor{
public String linkFileHelp=
"file to save mapping between examples and spans they correspond to";
public void linkFile(String s){
linkFileName=s;
}
public CommandLineProcessor fe(String s){
fe=
(SpanFeatureExtractor)CommandLineUtil.newObjectFromBSH(s,
SpanFeatureExtractor.class);
return (fe instanceof CommandLineProcessor.Configurable)?tryToGetCLP(fe)
:null;
}
public CommandLineProcessor feOp(){
return tryToGetCLP(fe);
}
@Override
public void usage(){
System.out.println("special parameters:");
System.out.println(" [-linkFile FILE] "+linkFileHelp);
System.out.println(" [-fe beanshell] "+"feature extractor");
System.out.println(" [-feOp opt1 ...] "
+"options for feature extractor");
System.out.println();
}
public String getLinkFileHelp(){
return linkFileHelp;
}
}
@Override
public CommandLineProcessor getCLP(){
return new JointCommandLineProcessor(new CommandLineProcessor[]{
new LinkFileParams(),gui,base,signal,save});
}
public String getLinkFile(){
return linkFileName;
}
public void setLinkFile(String s){
linkFileName=s;
}
public SpanFeatureExtractor getFeatureExtractor(){
return fe;
}
public void setFeatureExtractor(SpanFeatureExtractor fe){
this.fe=fe;
}
public CommandLineUtil.ClassificationSignalParams getSignalParameters(){
return signal;
}
public void setSignalParameters(CommandLineUtil.ClassificationSignalParams p){
signal=p;
}
//
// do it
//
@Override
public void doMain(){
// check that inputs are valid
if(signal.spanProp==null&&signal.spanType==null){
throw new IllegalArgumentException(
"one of -spanProp or -spanType must be specified");
}
if(signal.spanProp!=null&&signal.spanType!=null){
throw new IllegalArgumentException(
"only one of -spanProp or -spanType can be specified");
}
if(save.saveAs==null){
throw new IllegalArgumentException("-saveAs must be specified");
}
// construct the dataset and save it
//if (tagDataFlag) {
// dataset =
//SequenceAnnotatorLearner.prepareSequenceData(base.labels,signal.spanProp,signal.spanType,fe,historySize,reduction);
dataset=
CommandLineUtil.toDataset(base.labels,fe,signal.spanProp,
signal.spanType,signal.candidateType);
try{
DatasetLoader.save(dataset,save.saveAs);
}catch(IOException ex){
System.out.println("error saving dataset to '"+save.saveAs+"': "+ex);
}
if(base.showResult){
new ViewerFrame("Dataset",dataset.toGUI());
}
if(linkFileName!=null){
try{
saveLinkInfo(new File(linkFileName),dataset,save.getSaveAs());
}catch(IOException ex){
System.out.println("error saving link information to '"+linkFileName+
"': "+ex);
}
}
}
private void saveLinkInfo(File linkFile,Dataset dataset,String datasetFileName)
throws IOException{
int lineNo=0;
PrintStream out=new PrintStream(new FileOutputStream(linkFile));
for(Iterator<Example> i=dataset.iterator();i.hasNext();){
Example ex=i.next();
lineNo++;
if(!(ex.getSource() instanceof Span)){
throw new IllegalArgumentException(
"example not associated with a span: "+ex);
}
Span span=(Span)ex.getSource();
out.println(DatasetLoader.getSourceAssignedToExample(datasetFileName,
lineNo)+
" "+
span.getDocumentId()+
" "+
span.getLoChar()+
" "+
(span.getHiChar()-span.getLoChar()));
}
out.close();
}
@Override
public Object getMainResult(){
return dataset;
}
public static void main(String args[]){
new PreprocessTextForClassifier().callMain(args);
}
}