package uk.ac.shef.dcs.jate.util; import org.apache.sis.util.StringBuilders; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; import org.apache.solr.core.CoreContainer; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.feature.*; import uk.ac.shef.dcs.jate.indexing.IndexingHandler; import uk.ac.shef.dcs.jate.io.TikaSimpleDocumentCreator; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; /** * Created by zqz on 11/01/17. */ public class RegressionFeatureGenerator { public static void main(String[] args) throws JATEException, IOException, SolrServerException { EmbeddedSolrServer server; CoreContainer testBedContainer = new CoreContainer("/home/zqz/Work/jate/testdata/solr-testbed"); testBedContainer.load(); server = new EmbeddedSolrServer(testBedContainer, "GENIA"); File[] files= new File("/home/zqz/Work/data/jate_data/genia_gs/text/files_standard").listFiles(); List<String> tasks = new ArrayList<>(); JATEProperties prop = new JATEProperties(); for(File f: files) { tasks.add(f.toString()); } //############### this block is for building a testing index ######### /*IndexingHandler indexer = new IndexingHandler(); indexer.index(tasks, 100, new TikaSimpleDocumentCreator(), server, prop); server.close(); System.exit(0);*/ //############## the following is the code for fetching features from the built index ###### WordShapeFBMaster wordShapeFBMaster = new WordShapeFBMaster(server.getCoreContainer().getCore("GENIA").getSearcher().get(), prop,0,null); PositionFeatureMaster positionFeatureMaster = new PositionFeatureMaster(server.getCoreContainer().getCore("GENIA").getSearcher().get(), prop,0); WordShapeFeature wordshapeFeature= (WordShapeFeature)wordShapeFBMaster.build(); PositionFeature positionFeature= (PositionFeature)positionFeatureMaster.build(); int c=0; System.out.println("\nTOTAL="+wordshapeFeature.getAllTerms().size()); for(String term : wordshapeFeature.getAllTerms()){ c++; System.out.println("#"+c+" term (term has been normalised!!!)="+term); System.out.println("\thasAcronymToken:"+wordshapeFeature.getHasAcronymFeature(term)); System.out.println("\thasNumericToken:"+wordshapeFeature.getHasNumberFeature(term)); System.out.println("\thasSymbolChar:"+wordshapeFeature.getHasSymbolFeature(term)); System.out.println("\thasDigitChar:"+wordshapeFeature.getHasDigitFeature(term)); System.out.println("\thasUppercaseChar:"+wordshapeFeature.getHasUppercaseFeature(term)); //frequency found in doc titles Integer inTitle=positionFeature.getFoundInDocTitles(term); if(inTitle==null){ inTitle=0; } System.out.println("\tfrequencyInDocTitle (this can be used to compute a ratio to total frequency):"+inTitle); //PDFT: distances of the source paragraph from the doc title, measured as #of paragraphs from title divided by //total paragraphs in doc List<Double> paragraphDistancesFromTitle=positionFeature.getParDistFromTitle(term); /*if(paragraphDistancesFromTitle.size()>1){ System.out.println(); }*/ double[] minMaxAvgPDFT=calculateMinMaxAvg(paragraphDistancesFromTitle); System.out.println("\tmin PDFT="+minMaxAvgPDFT[0]); System.out.println("\tmax PDFT="+minMaxAvgPDFT[1]); System.out.println("\tavg PDFT="+minMaxAvgPDFT[2]); //SDFT: distances of the source sentence from the doc title, measured as #of sentences from title divided by //total sentences in doc List<Double> sentenceDistancesFromTitle=positionFeature.getSentDistFromTitle(term); double[] minMaxAvgSDFT=calculateMinMaxAvg(sentenceDistancesFromTitle); System.out.println("\tmin SDFT="+minMaxAvgSDFT[0]); System.out.println("\tmax SDFT="+minMaxAvgSDFT[1]); System.out.println("\tavg SDFT="+minMaxAvgSDFT[2]); /*if(sentenceDistancesFromTitle.size()>1){ System.out.println(); }*/ //SDFP: distances of the source sentence from the first sentence of its containing paragraph, measured as // #of sentences from the first sentence divided by //total sentences in the paragraph List<Double> sentenceDistancesFromPar=positionFeature.getSentDistFromPar(term); /*if(sentenceDistancesFromPar.size()>1){ System.out.println(); }*/ double[] minMaxAvgSDFP=calculateMinMaxAvg(sentenceDistancesFromPar); System.out.println("\tmin SDFP="+minMaxAvgSDFP[0]); System.out.println("\tmax SDFP="+minMaxAvgSDFP[1]); System.out.println("\tavg SDFP="+minMaxAvgSDFP[2]); } System.exit(0); } protected static double[] calculateMinMaxAvg(List<Double> numbers){ Collections.sort(numbers); Double max=numbers.get(numbers.size()-1); Double min=numbers.get(0); double total = 0; for(Double d: numbers) total+=d; Double avg=total/(double)numbers.size(); return new double[]{min, max, avg}; } }