package uk.ac.shef.dcs.jate.feature; import org.apache.log4j.Logger; import org.apache.solr.search.SolrIndexSearcher; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import java.util.*; public class FrequencyCtxBasedCopier extends AbstractFeatureBuilder { private static final Logger LOG = Logger.getLogger(FrequencyCtxBasedCopier.class.getName()); private FrequencyCtxBased source; private FrequencyTermBased frequencyFeature; private int frequencyThreshold; public FrequencyCtxBasedCopier(SolrIndexSearcher solrIndexSearcher, JATEProperties properties, FrequencyCtxBased source, FrequencyTermBased frequencyFeature, double topFraction) { super(solrIndexSearcher, properties); this.source=source; this.frequencyFeature =frequencyFeature; List<Integer> frequencies = new ArrayList<>(frequencyFeature.getMapTerm2TTF().values()); Collections.sort(frequencies); int pos = (int)(frequencies.size()*topFraction); this.frequencyThreshold=frequencies.get(frequencies.size()-pos); } @Override public AbstractFeature build() throws JATEException { FrequencyCtxBased result = new FrequencyCtxBased(); LOG.info("Copying features using 1 core, filtering "+frequencyFeature.getMapTerm2TTF().size()+" terms."); Set<String> filteredTerms = new HashSet<>(); int count=0; for(Map.Entry<String, Integer> en: frequencyFeature.getMapTerm2TTF().entrySet()){ count++; if(count%100000==0) LOG.debug(count+"/"+frequencyFeature.getMapTerm2TTF().size()); if(en.getValue()<frequencyThreshold) continue; filteredTerms.add(en.getKey()); } count=0; int countContext=0; LOG.info("Complete filtering, copying for "+filteredTerms.size()+" terms."); for(String ft: filteredTerms){ Set<ContextWindow> ctxx = source.getContexts(ft); if(ctxx==null) continue;//this is possible because candidate term may be incorrectly generated across context (e.g., sentence) boundaries countContext+=ctxx.size(); for(ContextWindow ctx: ctxx){ int tfInCtx=source.getMapCtx2TFIC().get(ctx).get(ft); result.increment(ctx,ft, tfInCtx); result.increment(ctx,tfInCtx); } count++; if(count%100000==0) { LOG.debug(count + "/" + filteredTerms.size() + ", ctxx=" + countContext); countContext=0; } } LOG.info("Complete copying features."); return result; } }