package experiments.collective.entdoccentric.query;

import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.util.Version;

import experiments.collective.entdoccentric.StandardQueryDataObject.EntityObject;
import experiments.collective.entdoccentric.LTR.LTRBooleanQuery;
import experiments.collective.entdoccentric.LTR.LearnToRankClause;
import experiments.collective.entdoccentric.LTR.LearnToRankFuzzyQuery;
import experiments.collective.entdoccentric.LTR.LearnToRankQuery;
import experiments.collective.entdoccentric.LTR.LearnToRankTermQuery;
import experiments.collective.entdoccentric.dpo.EntityToDisambiguate;

/**
 * Feature setup for the entity based approach.
 *
 * <p>
 * Ten sub-queries ("Feature1" .. "Feature10") are built from an
 * {@link EntityObject}, registered on the main {@link LearnToRankQuery} and
 * given the weights listed in {@link #FEATURE_WEIGHTS}. All features are wired
 * up in one place, {@link #setSubQueries(EntityObject)}.
 *
 * <p>
 * Usage: call {@link #setMainQuery(LearnToRankQuery)} first, then
 * {@link #setSubQueries(EntityObject)}.
 */
public class LearnToRankFeatureSetupEntityBased implements LearnToRankFeatureSetup {

	/** Index field queried by the "t_d" features (1, 3, 5, 7). */
	private static final String TITLE_FIELD = "title";

	/** Index field queried by the "a_d" features (2, 4, 6, 8). */
	private static final String DESCRIPTION_FIELD = "description";

	/**
	 * Weights for Feature1 .. Feature10, in that order.
	 * NOTE(review): presumably the output of a learn-to-rank training run -
	 * the origin of the numbers is not visible in this file; confirm.
	 */
	private static final float[] FEATURE_WEIGHTS = {
			0.0375069f, // Feature1
			0.001f, // Feature2
			0.0238851f, // Feature3
			0.0858324f, // Feature4
			0.0375069f, // Feature5
			0.001f, // Feature6
			0.001f, // Feature7
			0.001f, // Feature8
			0.00513431f, // Feature9
			0.501216f // Feature10
	};

	// Reference only: weight vectors that were present as commented-out code
	// (index = feature number - 1).
	//   A: 0.0915161, 0.01771, 0.0450872, 0.115529, 0.0915161, 0.01771,
	//      -0.0468604, -0.0947746, 0.321587, 0.379994
	//   B: 1 for all ten features
	//   C: 0.421661, 0.239444, 0.0219451, 0.155427, 0.134472, 0.001,
	//      0.0870064, 0.465076 (features 1-8 only), with a further
	//      alternative for features 7 and 8 of 17.4425 and 18.5569
	//
	// Other commented-out variants that were removed from this file:
	//   - features 1-8 built from a QueryDataObject (getKeyword() /
	//     getEntityContext()) against the fields "Label" and "Description";
	//   - per-token LearnToRankFuzzyQuery instead of LearnToRankTermQuery in
	//     the context features, optionally on tokens passed through a
	//     usePorterStemmer(String) helper backed by the analyzer below;
	//   - a reset() method that closed the analyzer.

	/** Clauses registered on the current main query, in registration order. */
	private List<LearnToRankClause> features;

	/** Main query that receives the feature clauses. */
	private LearnToRankQuery query;

	/**
	 * Created in the constructor but not read by any active code in this
	 * class. NOTE(review): kept so that construction behaves as before;
	 * consider removing it, or closing it once it is used again.
	 */
	private final Analyzer analyzer;

	public LearnToRankFeatureSetupEntityBased() {
		this.analyzer = new PositionalPorterStopAnalyzer(Version.LUCENE_41);
	}

	/**
	 * Sets the main query and starts a fresh, empty list of feature clauses.
	 *
	 * @param query
	 *            query the feature clauses will be added to
	 */
	public void setMainQuery(LearnToRankQuery query) {
		this.features = new LinkedList<LearnToRankClause>();
		this.query = query;
	}

	/**
	 * Builds Feature1 .. Feature10 for the given entity, adds them to the main
	 * query and assigns {@link #FEATURE_WEIGHTS} to the clauses created by
	 * this call.
	 *
	 * <p>
	 * Weights are applied to exactly the clauses registered here, not to
	 * fixed positions of the accumulated clause list, so a repeated call
	 * without an intervening {@link #setMainQuery(LearnToRankQuery)} still
	 * weights the new clauses.
	 *
	 * @param dataObject
	 *            entity providing the surface text and its context
	 * @throws IllegalStateException
	 *             if no main query has been set
	 */
	public void setSubQueries(EntityObject dataObject) {
		if (this.query == null || this.features == null) {
			throw new IllegalStateException(
					"setMainQuery(LearnToRankQuery) must be called with a non-null query before setSubQueries(EntityObject)");
		}

		// Array initializers are evaluated left to right, so every feature is
		// created and registered before the next one is built.
		final LearnToRankClause[] added = {
				register(createFeature1(dataObject), "Feature1"),
				register(createFeature2(dataObject), "Feature2"),
				register(createFeature3(dataObject), "Feature3"),
				register(createFeature4(dataObject), "Feature4"),
				register(createFeature5(dataObject), "Feature5"),
				register(createFeature6(dataObject), "Feature6"),
				register(createFeature7(dataObject), "Feature7"),
				register(createFeature8(dataObject), "Feature8"),
				register(createFeature9(dataObject), "Feature9"),
				register(createFeature10(dataObject), "Feature10") };

		// Weights are set only after all clauses have been added.
		for (int i = 0; i < added.length; i++) {
			added[i].setWeight(FEATURE_WEIGHTS[i]);
		}
	}

	/**
	 * Intentionally a no-op: this setup only builds features from an
	 * {@link EntityObject}.
	 *
	 * @param task
	 *            ignored
	 */
	@Override
	public void setSubQueries(EntityToDisambiguate task) {
		// Not implemented for the entity based setup.
	}

	/**
	 * Adds one feature query to the main query and records the resulting
	 * clause.
	 *
	 * @param featureQuery
	 *            sub-query to add
	 * @param name
	 *            feature name passed to the main query
	 * @return the clause returned by the main query
	 */
	private LearnToRankClause register(Query featureQuery, String name) {
		final LearnToRankClause clause = this.query.add(featureQuery, name, false);
		this.features.add(clause);
		return clause;
	}

	/**
	 * Splits the entity context on single space characters.
	 *
	 * <p>
	 * The split pattern is deliberately a single space: consecutive or
	 * leading spaces yield empty tokens, which the context features turn into
	 * clauses like any other token.
	 *
	 * @param dataObject
	 *            entity whose context is tokenized
	 * @return the context tokens
	 */
	private static String[] contextTokens(EntityObject dataObject) {
		return dataObject.getContext().split(" ");
	}

	/**
	 * Feature 1: cos(Lucene-Score) * sim(t_d, q)
	 *
	 * <p>
	 * Fuzzy query for the entity text on the title field, constructed with a
	 * {@link DefaultSimilarity}.
	 *
	 * @param dataObject
	 *            entity whose text is queried
	 * @return the feature query
	 */
	private Query createFeature1(EntityObject dataObject) {
		return new LearnToRankFuzzyQuery(new Term(TITLE_FIELD, dataObject.getText()), new DefaultSimilarity());
	}

	/**
	 * Feature 2: cos(Lucene-Score) * sim(a_d, q)
	 *
	 * <p>
	 * Fuzzy query for the entity text on the description field, constructed
	 * with a {@link DefaultSimilarity}.
	 *
	 * @param dataObject
	 *            entity whose text is queried
	 * @return the feature query
	 */
	private Query createFeature2(EntityObject dataObject) {
		return new LearnToRankFuzzyQuery(new Term(DESCRIPTION_FIELD, dataObject.getText()), new DefaultSimilarity());
	}

	/**
	 * Feature 3: cos(Lucene-Score) * sim(t_d, q_c)
	 *
	 * <p>
	 * Boolean query with one optional ({@link Occur#SHOULD}) term query per
	 * context token on the title field, constructed with a
	 * {@link DefaultSimilarity}.
	 *
	 * @param dataObject
	 *            entity whose context is queried
	 * @return the feature query
	 */
	private Query createFeature3(EntityObject dataObject) {
		final String[] tokens = contextTokens(dataObject);
		final LTRBooleanQuery contextQuery = new LTRBooleanQuery();
		final DefaultSimilarity similarity = new DefaultSimilarity();
		for (String token : tokens) {
			contextQuery.add(new LearnToRankTermQuery(new Term(TITLE_FIELD, token), similarity), Occur.SHOULD);
		}
		return contextQuery;
	}

	/**
	 * Feature 4: cos(Lucene-Score) * sim(a_d, q_c)
	 *
	 * <p>
	 * Boolean query with one optional ({@link Occur#SHOULD}) term query per
	 * context token on the description field, constructed with a
	 * {@link DefaultSimilarity}.
	 *
	 * @param dataObject
	 *            entity whose context is queried
	 * @return the feature query
	 */
	private Query createFeature4(EntityObject dataObject) {
		final String[] tokens = contextTokens(dataObject);
		final LTRBooleanQuery contextQuery = new LTRBooleanQuery();
		final DefaultSimilarity similarity = new DefaultSimilarity();
		for (String token : tokens) {
			contextQuery.add(new LearnToRankTermQuery(new Term(DESCRIPTION_FIELD, token), similarity), Occur.SHOULD);
		}
		return contextQuery;
	}

	/**
	 * Feature 5: cos(BM25) * sim(t_d, q)
	 *
	 * <p>
	 * Fuzzy query for the entity text on the title field, constructed with a
	 * {@link BM25Similarity}.
	 *
	 * @param dataObject
	 *            entity whose text is queried
	 * @return the feature query
	 */
	private Query createFeature5(EntityObject dataObject) {
		return new LearnToRankFuzzyQuery(new Term(TITLE_FIELD, dataObject.getText()), new BM25Similarity());
	}

	/**
	 * Feature 6: cos(BM25) * sim(a_d, q)
	 *
	 * <p>
	 * Fuzzy query for the entity text on the description field, constructed
	 * with a {@link BM25Similarity}.
	 *
	 * @param dataObject
	 *            entity whose text is queried
	 * @return the feature query
	 */
	private Query createFeature6(EntityObject dataObject) {
		return new LearnToRankFuzzyQuery(new Term(DESCRIPTION_FIELD, dataObject.getText()), new BM25Similarity());
	}

	/**
	 * Feature 7: cos(BM25) * sim(t_d, q_c)
	 *
	 * <p>
	 * Boolean query with one optional ({@link Occur#SHOULD}) term query per
	 * context token on the title field, constructed with a
	 * {@link BM25Similarity}.
	 *
	 * @param dataObject
	 *            entity whose context is queried
	 * @return the feature query
	 */
	private Query createFeature7(EntityObject dataObject) {
		final String[] tokens = contextTokens(dataObject);
		final LTRBooleanQuery contextQuery = new LTRBooleanQuery();
		final BM25Similarity similarity = new BM25Similarity();
		for (String token : tokens) {
			contextQuery.add(new LearnToRankTermQuery(new Term(TITLE_FIELD, token), similarity), Occur.SHOULD);
		}
		return contextQuery;
	}

	/**
	 * Feature 8: cos(BM25) * sim(a_d, q_c)
	 *
	 * <p>
	 * Boolean query with one optional ({@link Occur#SHOULD}) term query per
	 * context token on the description field, constructed with a
	 * {@link BM25Similarity}.
	 *
	 * @param dataObject
	 *            entity whose context is queried
	 * @return the feature query
	 */
	private Query createFeature8(EntityObject dataObject) {
		final String[] tokens = contextTokens(dataObject);
		final LTRBooleanQuery contextQuery = new LTRBooleanQuery();
		final BM25Similarity similarity = new BM25Similarity();
		for (String token : tokens) {
			contextQuery.add(new LearnToRankTermQuery(new Term(DESCRIPTION_FIELD, token), similarity), Occur.SHOULD);
		}
		return contextQuery;
	}

	/**
	 * Feature 9: Prior
	 *
	 * @param dataObject
	 *            not used by this feature
	 * @return a new {@code PriorQuery}
	 */
	private Query createFeature9(EntityObject dataObject) {
		return new PriorQuery();
	}

	/**
	 * Feature 10: SensePrior
	 *
	 * @param dataObject
	 *            entity whose text is handed to the {@code SensePriorQuery}
	 * @return a new {@code SensePriorQuery} for the entity text
	 */
	private Query createFeature10(EntityObject dataObject) {
		return new SensePriorQuery(dataObject.getText());
	}
}