package edu.cmu.minorthird.text.learn; import java.io.File; import java.io.IOException; import java.util.Iterator; import java.util.Set; import org.apache.log4j.Level; import edu.cmu.minorthird.classify.AbstractClassificationChecks; import edu.cmu.minorthird.classify.BasicDataset; import edu.cmu.minorthird.classify.ClassLabel; import edu.cmu.minorthird.classify.ClassifierLearner; import edu.cmu.minorthird.classify.Dataset; import edu.cmu.minorthird.classify.Example; import edu.cmu.minorthird.text.BasicTextLabels; import edu.cmu.minorthird.text.Span; import edu.cmu.minorthird.text.TextBase; import edu.cmu.minorthird.text.TextLabels; import edu.cmu.minorthird.text.TextLabelsLoader; /** * @author ksteppe */ public abstract class ClassifyTest extends AbstractClassificationChecks{ /** file loading of data */ protected String dataFile; protected String labelsFile; /** text base of training data */ protected TextBase base; protected TextLabels labels; /** testing data */ protected TextBase testBase; protected TextLabels testLabels; /** span checking */ String documentId; protected String labelString; /** defaults for testing */ protected final static SpanFeatureExtractor DEFAULT_SFE= edu.cmu.minorthird.text.learn.SampleFE.BAG_OF_WORDS; public ClassifyTest(String s){ super(s); log.setLevel(Level.DEBUG); } /** * classify with default features, learner, splitter check against given data */ public void classify(double[] referenceData){ checkClassifyText(DEFAULT_SFE,DEFAULT_LEARNER,referenceData); } /** run default classification but output evaluation with no check */ public void benchMarkClassify(){ classify(null); } /** * Base test for classification send null referenceData to get a print out */ public void checkClassifyText(SpanFeatureExtractor fe, ClassifierLearner learner,double[] referenceStats){ try{ Dataset trainData=createDataSet(base,labels,fe); Dataset testData=createDataSet(testBase,testLabels,fe); checkClassify(learner,trainData,testData,referenceStats); }catch(Exception e){ log.fatal(e,e); fail(); } } /** * takes the text base, labels and a feature extractor produces a dataset * * @param fe * @return Dataset */ private Dataset createDataSet(TextBase base,TextLabels labels, edu.cmu.minorthird.text.learn.SpanFeatureExtractor fe){ Dataset data=new BasicDataset(); for(Iterator<Span> i=base.documentSpanIterator();i.hasNext();){ Span s=i.next(); // System.out.println( labels ); double label=getLabel(labels,s); // log.info("label: " + s.getDocumentId() + " : is : " + label); data.add(new Example(fe.extractInstance(labels,s),ClassLabel .binaryLabel(label))); } return data; } /** extract labeling for the given span */ protected double getLabel(TextLabels labels,Span s){ double label=labels.hasType(s,labelString)?+1:-1; return label; } /** load labels from file */ private void loadLabels() throws IOException{ // set up the labels labels=new TestTextLabels(base); new TextLabelsLoader().importOps((BasicTextLabels)labels,base,new File( labelsFile)); } /** * check the spans for the loaded labels The test is to ensure that the spans * in the labels and the spans in the text base are the same, with no * 'off-by-one' errors. * * @throws java.io.IOException */ void checkSpans() throws IOException{ loadLabels(); Span baseSpan=base.documentSpan(documentId); // log.info("baseSpan for " + documentId + " is " + baseSpan); log.info("span from "+baseSpan.getDocumentId()+" of size "+baseSpan.size()); Set<String> typeSet=labels.getTypes(); log.info(typeSet.toString()); Span checkSpan=null; for(Iterator<String> iterator=typeSet.iterator();iterator.hasNext();){ String typeName=iterator.next(); // log.info("**************** TYPES: " + typeName + " ********************"); // now get all the stuff with that type for(Iterator<Span> it=base.documentSpanIterator();it.hasNext();){ String id=it.next().getDocumentId(); Set<Span> spanSet=((TestTextLabels)labels).getTypeSet(typeName,id); for(Iterator<Span> spanIt=spanSet.iterator();spanIt.hasNext();){ Span span=spanIt.next(); if(id.equals(documentId)){ log.info(" Document ID: "+id); log.info(" span: "+span.getTextToken(0).asString()+":"+ span.getTextToken(span.size()-1)+" size: "+span.size()); checkSpan=span; } } // spanIt } // it } // iterator for(Iterator<Span> i=base.documentSpanIterator();i.hasNext();){ Span s=i.next(); if(s.getDocumentId().equals(documentId)){ log.info(" span: "+s.getTextToken(0).asString()+":"+ s.getTextToken(s.size()-1)+" size: "+s.size()); log.info(" checkSpan: "+checkSpan.getTextToken(0).asString()+ ":"+checkSpan.getTextToken(checkSpan.size()-1)+" size: "+ checkSpan.size()); log.info(new Boolean(checkSpan.equals(s))); assertEquals(checkSpan.size(),s.size()); assertEquals(checkSpan,s); } } } }