package uk.ac.shef.dcs.jate.feature;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.search.SolrIndexSearcher;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.util.SolrUtil;
import java.io.IOException;
import java.util.*;
/**
 * Base class for feature builders that read their input from a Solr index.
 *
 * <p>It holds the {@link SolrIndexSearcher} and {@link JATEProperties} shared by
 * subclasses and offers helpers that collect the distinct terms stored in the
 * Solr fields named by the properties.
 */
public abstract class AbstractFeatureBuilder {
    protected SolrIndexSearcher solrIndexSearcher;
    protected JATEProperties properties;

    /**
     * Setting SEQUENTIAL_THRESHOLD (or MAX_TASKS_PER_WORKER) to a good-in-practice value is a trade-off.
     * The documentation for the ForkJoin framework suggests creating parallel subtasks until
     * the number of basic computation steps is somewhere over 100 and less than 10,000.
     *
     * The exact number is not crucial provided you avoid extremes.
     *
     * @see <a href="http://homes.cs.washington.edu/~djg/teachingMaterials/grossmanSPAC_forkJoinFramework.html">Beginner's introduction to the ForkJoin framework</a>
     * @see <a href="http://stackoverflow.com/questions/19925820/fork-join-collecting-results/19926423#19926423">Fork/Join: collecting results</a>
     */
    protected final static int MIN_SEQUENTIAL_THRESHOLD = 100;
    protected final static int MAX_SEQUENTIAL_THRESHOLD = 10000;

    /**
     * Creates a builder bound to the given index searcher and configuration.
     *
     * @param solrIndexSearcher searcher used to read term information from the index
     * @param properties        configuration supplying the Solr field names to read
     */
    public AbstractFeatureBuilder(SolrIndexSearcher solrIndexSearcher, JATEProperties properties) {
        this.solrIndexSearcher = solrIndexSearcher;
        this.properties = properties;
    }

    /**
     * Builds the feature produced by this builder; implemented by subclasses.
     *
     * @return the built feature
     * @throws JATEException if the feature cannot be built
     */
    public abstract AbstractFeature build() throws JATEException;

    /**
     * Collects the distinct single-token entries (terms containing no space character)
     * of the Solr field returned by {@link JATEProperties#getSolrFieldNameJATENGramInfo()}.
     *
     * @return the set of single-token terms; never empty
     * @throws JATEException if the field contains no single-token term
     * @throws IOException   if reading the term enumeration fails
     */
    protected Set<String> getUniqueWords() throws JATEException, IOException {
        Set<String> allWords = collectTerms(properties.getSolrFieldNameJATENGramInfo(), true);
        if (allWords.isEmpty()) {
            throw new JATEException("MWEMetadata are required on 'Words', however there are no single-token lexical units in the " +
                    properties.getSolrFieldNameJATENGramInfo() + " field. Check to see if your analyzer pipeline outputs uni-grams");
        }
        return allWords;
    }

    /**
     * Retrieves the term candidates from the Solr field returned by
     * {@link JATEProperties#getSolrFieldNameJATECTerms()}
     * (see {@code uk.ac.shef.dcs.jate.JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS}).
     *
     * <p>The method assumes that the term candidates are extracted at index-time and
     * stored in that pre-configured field.
     *
     * @return the set of term candidate surface forms (may be empty)
     * @throws JATEException if the term vector cannot be obtained
     * @throws IOException   if reading the term enumeration fails
     */
    protected Set<String> getUniqueTerms() throws JATEException, IOException {
        return collectTerms(properties.getSolrFieldNameJATECTerms(), false);
    }

    /**
     * Enumerates the terms of {@code fieldName} and returns the distinct non-empty ones.
     *
     * @param fieldName        name of the Solr field whose terms are read
     * @param singleTokensOnly when {@code true}, terms containing a space character are skipped
     * @return a new mutable set of the decoded terms
     * @throws JATEException if the term vector cannot be obtained
     * @throws IOException   if reading the term enumeration fails
     */
    private Set<String> collectTerms(String fieldName, boolean singleTokensOnly)
            throws JATEException, IOException {
        Terms terms = SolrUtil.getTermVector(fieldName, solrIndexSearcher);
        TermsEnum termsEnum = terms.iterator();
        Set<String> collected = new HashSet<>();
        BytesRef current;
        // next() hands back the term it advanced to, or null once the enumeration is exhausted.
        while ((current = termsEnum.next()) != null) {
            if (current.length == 0) {
                continue;
            }
            // Decode once per term; the BytesRef may be reused by the enumeration.
            String termStr = current.utf8ToString();
            if (singleTokensOnly && termStr.contains(" ")) {
                continue;
            }
            collected.add(termStr);
        }
        return collected;
    }
}