package edu.cmu.minorthird.ui; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; import java.util.Iterator; import edu.cmu.minorthird.classify.DatasetLoader; import edu.cmu.minorthird.classify.Example; import edu.cmu.minorthird.classify.sequential.SequenceDataset; import edu.cmu.minorthird.text.Span; import edu.cmu.minorthird.text.learn.BeginContinueEndUniqueReduction; import edu.cmu.minorthird.text.learn.Extraction2TaggingReduction; import edu.cmu.minorthird.text.learn.SequenceAnnotatorLearner; import edu.cmu.minorthird.util.BasicCommandLineProcessor; import edu.cmu.minorthird.util.CommandLineProcessor; import edu.cmu.minorthird.util.JointCommandLineProcessor; import edu.cmu.minorthird.util.StringUtil; import edu.cmu.minorthird.util.gui.ViewerFrame; /** * Preprocess extraction text data for sequential learning methods. * * @author William Cohen */ public class PreprocessTextForExtractor extends PreprocessTextForClassifier{ protected int historySize=1; protected Extraction2TaggingReduction reduction= new BeginContinueEndUniqueReduction(); public PreprocessTextForExtractor(){ super(); this.fe=new Recommended.TokenFE(); } public class ExtractionReductionParams extends BasicCommandLineProcessor{ public void history(String s){ historySize=StringUtil.atoi(s); } public void reduction(String s){ reduction= (Extraction2TaggingReduction)CommandLineUtil.newObjectFromBSH(s, Extraction2TaggingReduction.class); } } public void usage(){ System.out.println("extraction-related parameters:"); System.out.println(" [-history N] " +"number of previous classes to use as features"); System.out.println(" [-reduction beanshell] " +"how to map tokens to classes"); System.out.println(); } @Override public CommandLineProcessor getCLP(){ return new JointCommandLineProcessor(new CommandLineProcessor[]{ new LinkFileParams(),new ExtractionReductionParams(),gui,base,signal, save}); } public int getHistorySize(){ return historySize; } public void setHistorySize(int n){ this.historySize=n; } public Extraction2TaggingReduction getReduction(){ return reduction; } public void setReduction(Extraction2TaggingReduction r){ this.reduction=r; } // // do it // @Override public void doMain(){ // check that inputs are valid if(signal.spanProp==null&&signal.spanType==null){ throw new IllegalArgumentException( "one of -spanProp or -spanType must be specified"); } if(signal.spanProp!=null&&signal.spanType!=null){ throw new IllegalArgumentException( "only one of -spanProp or -spanType can be specified"); } if(save.saveAs==null){ throw new IllegalArgumentException("-saveAs must be specified"); } dataset= SequenceAnnotatorLearner.prepareSequenceData(base.labels, signal.spanType,signal.spanProp,fe,historySize,reduction); try{ DatasetLoader.saveSequence((SequenceDataset)dataset,save.saveAs); }catch(IOException ex){ System.out.println("error saving sequential dataset to '"+save.saveAs+ "': "+ex); } if(base.showResult){ new ViewerFrame("Dataset",dataset.toGUI()); } if(linkFileName!=null){ try{ saveLinkInfoSequence(new File(linkFileName),(SequenceDataset)dataset, save.getSaveAs()); }catch(IOException ex){ System.out.println("error saving link information to '"+linkFileName+ "': "+ex); } } } private void saveLinkInfoSequence(File linkFile,SequenceDataset dataset, String datasetFileName) throws IOException{ int lineNo=0; PrintStream out=new PrintStream(new FileOutputStream(linkFile)); for(Iterator<Example[]> i=dataset.sequenceIterator();i.hasNext();){ Example[] seq=i.next(); for(int j=0;j<seq.length;j++){ Example ex=seq[j]; lineNo++; if(!(ex.getSource() instanceof Span)){ throw new IllegalArgumentException( "example not associated with a span: "+ex); } Span span=(Span)ex.getSource(); out.println(DatasetLoader.getSourceAssignedToExample(datasetFileName, lineNo)+ " "+ span.getDocumentId()+ " "+ span.getLoChar()+ " "+ (span.getHiChar()-span.getLoChar())); } lineNo++; // count a line for the sequence terminator '*' } out.close(); } public static void main(String args[]){ new PreprocessTextForExtractor().callMain(args); } }