package uk.ac.shef.dcs.jate.io;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.json.simple.parser.ParseException;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.app.App;
import uk.ac.shef.dcs.jate.eval.ATEResultLoader;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.util.SolrUtil;
import java.io.*;
import java.nio.file.Paths;
import java.util.*;
/**
* This class outputs the term candidates and their scores on a per-file basis.
*
*
*/
public class FileBasedOutputWriter {
public static void main(String[] args) throws JATEException, IOException, ParseException {
String outFolder=args[0];
String solrHomePath = args[1];
String solrCoreName = args[2];
final EmbeddedSolrServer solrServer = new EmbeddedSolrServer(Paths.get(solrHomePath), solrCoreName);
JATEProperties jateProp = App.getJateProperties(args[3]);
List<String> predictions=null;
if(args.length>4){
predictions=ATEResultLoader.loadFromJSON(args[4]);
}
output(outFolder, solrServer.getCoreContainer().getCore(solrCoreName),
jateProp, predictions);
}
/**
*
* @param outFolder
* @param predictions if provided, will be used as a filter to only select these that are found within each file
*/
public static void output(String outFolder, SolrCore core, JATEProperties properties, List<String> predictions){
SolrIndexSearcher searcher = core.getSearcher().get();
List<Integer> docIds = new ArrayList<>();
for (int i = 0; i < searcher.maxDoc(); i++) {
docIds.add(i);
}
Set<String> predictionStrings =new HashSet<>();
if(predictions!=null){
predictionStrings.addAll(predictions);
}
int count=0;
for (int docId : docIds) {
count++;
try {
Terms lookupVector = SolrUtil.getTermVector(docId, properties.getSolrFieldNameJATENGramInfo(), searcher);
if(lookupVector==null){
//LOG.error("Term vector for document id="+count+" is null. The document may be empty");
System.err.println("Term vector for document id="+count+" is null. The document may be empty");
continue;
}
String filename=outFolder+ File.separator+new File(searcher.doc(docId).get("id")).getName();
System.out.println(count+","+filename);
Set<String> terms = collectTerms(
lookupVector);
List<String> sorted = new ArrayList<>();
if(predictions==null|| predictionStrings.size()==0)
sorted.addAll(terms);
else{
for(String t: terms){
if(predictionStrings.contains(t))
sorted.add(t);
/* else
System.err.println("This candidate term in this document does not exist: {"+t+" @ "+filename);
*/}
}
Collections.sort(sorted);
PrintWriter p = new PrintWriter(filename);
for(String s: sorted)
p.println(s);
p.close();
} catch (IOException ioe) {
StringBuilder sb = new StringBuilder("Unable to build feature for document id:");
sb.append(docId).append("\n");
sb.append(ExceptionUtils.getFullStackTrace(ioe));
//LOG.error(sb.toString());
} catch (JATEException je) {
StringBuilder sb = new StringBuilder("Unable to build feature for document id:");
sb.append(docId).append("\n");
sb.append(ExceptionUtils.getFullStackTrace(je));
//LOG.error(sb.toString());
}
}
core.close();
System.exit(0);
}
public static Set<String> collectTerms(Terms termVectorLookup) throws IOException {
Set<String> result = new HashSet<>();
TermsEnum tiRef= termVectorLookup.iterator();
BytesRef luceneTerm = tiRef.next();
while (luceneTerm != null) {
if (luceneTerm.length == 0) {
luceneTerm = tiRef.next();
continue;
}
String tString = luceneTerm.utf8ToString();
result.add(tString);
luceneTerm = tiRef.next();
}
return result;
}
}