package edu.isi.karma.semantictypes.tfIdf; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import edu.isi.karma.modeling.semantictypes.SemanticTypeLabel; /** * This class is responsible for predicting top-k suggestions for textual data * using TF-IDF based cosine similarity approach and checking if a document for * a semantic label already exists * * @author ramnandan * */ public class Searcher { private IndexSearcher indexSearcher = null; private Analyzer analyzer = null; private QueryParser parser = null; public Searcher(String filepath, String fieldName) throws IOException { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File( filepath))); indexSearcher = new IndexSearcher(reader); analyzer = new StandardAnalyzer(Version.LUCENE_48); if (fieldName.equalsIgnoreCase(Indexer.LABEL_FIELD_NAME)) { parser = new QueryParser(Version.LUCENE_48, Indexer.LABEL_FIELD_NAME, analyzer); } else { parser = new QueryParser(Version.LUCENE_48, Indexer.CONTENT_FIELD_NAME, analyzer); } } public List<SemanticTypeLabel> getTopK(int k, String content) throws ParseException, IOException { List<SemanticTypeLabel> result = new ArrayList<>(); content = content.toLowerCase().replaceAll("and", " ").replaceAll("or", " ").replaceAll("\\+", "").replaceAll("\\-", ""); int spaces = content.length() - content.replace(" ", "").length(); if (spaces > BooleanQuery.getMaxClauseCount()) { BooleanQuery.setMaxClauseCount(spaces); } //System.out.println("Query: " + content); Query query = parser.parse(QueryParser.escape(content)); TopDocs results = indexSearcher.search(query, k); ScoreDoc[] hits = results.scoreDocs; //System.out.println("Num Hits:" + hits.length); for (int i = 0; i < hits.length; i++) { Document doc = indexSearcher.doc(hits[i].doc); String labelString = doc.get(Indexer.LABEL_FIELD_NAME); result.add(new SemanticTypeLabel(labelString, hits[i].score)); } return result; } public Document getDocumentForLabel(String label) throws IOException { Query query = new TermQuery( new Term(Indexer.LABEL_FIELD_NAME, label)); TopDocs results = indexSearcher.search(query, 10); ScoreDoc[] hits = results.scoreDocs; for(int i=0; i<hits.length; i++) { Document doc = indexSearcher.doc(hits[i].doc); String labelString = doc.get(Indexer.LABEL_FIELD_NAME); if (labelString.equalsIgnoreCase(label)) // document for // exact semantic // label already // exists { return doc; } } return null; } public void close() { try { indexSearcher.getIndexReader().close(); } catch (IOException e) { // Ignore } } }