package ivory.sqe.retrieval;

import ivory.core.RetrievalEnvironment;
import ivory.core.data.index.Posting;
import ivory.core.data.index.PostingsList;
import ivory.core.data.index.PostingsReader;
import ivory.core.data.index.ProximityPostingsReaderOrderedWindow;
import ivory.smrf.model.GlobalEvidence;
import ivory.smrf.model.GlobalTermEvidence;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.google.common.base.Preconditions;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

public class PostingsReaderWrapper {
  // Default score for potentials with no postings.
  protected static final float DEFAULT_SCORE = 0.0f;

  protected final Posting curPosting = new Posting();
  protected RetrievalEnvironment env;
  protected PostingsReader postingsReader = null;
  protected GlobalTermEvidence gte;
  protected GlobalEvidence ge;
  // Whether or not we're at the end of the postings list.
  protected boolean endOfList = true;
  protected int lastScoredDocno = 0, iterStart = 0, iterStep = 1;
  protected String operator, termOrPhrase;
  protected String[] terms;
  protected JsonArray values;
  protected List<PostingsReaderWrapper> children;
  protected boolean isOOV = false;
  protected float[] weights;

  private final int numDocs;
  private final float avgDocLen;

  public PostingsReaderWrapper(JsonObject query, RetrievalEnvironment env, GlobalEvidence ge) {
    this.operator = query.entrySet().iterator().next().getKey();
    this.values = query.getAsJsonArray(operator);

    if (operator.equals("#weight") || operator.equals("#combweight")) {
      iterStart = 1;
      iterStep = 2;
      weights = new float[values.size() / 2];
      // In a #weight or #combweight structure, even-numbered indices correspond to
      // weights and odd-numbered indices correspond to terms/phrases.
      for (int i = 0; i < values.size(); i = i + iterStep) {
        weights[i / 2] = (float) values.get(i).getAsDouble();
      }
    }

    this.env = Preconditions.checkNotNull(env);
    this.numDocs = (int) env.getDocumentCount();
    // Cast before dividing so the average document length keeps its fractional part.
    this.avgDocLen = (float) env.getCollectionSize() / numDocs;

    // Read first posting.
    endOfList = false;

    // This is a non-leaf node, so create its children.
    children = new ArrayList<PostingsReaderWrapper>();
    for (int i = iterStart; i < values.size(); i = i + iterStep) {
      if (!values.get(i).isJsonPrimitive()) {
        // Child is a JSON object (non-leaf): recurse with the non-leaf constructor.
        children.add(new PostingsReaderWrapper(values.get(i).getAsJsonObject(), env, ge));
      } else {
        // Child is a string (leaf): use the leaf constructor.
        children.add(new PostingsReaderWrapper(values.get(i).getAsString(), env, ge));
      }
    }
    lastScoredDocno = 0;
  }

  public PostingsReaderWrapper(String termOrPhrase, RetrievalEnvironment env, GlobalEvidence ge) {
    this.env = Preconditions.checkNotNull(env);
    this.numDocs = (int) env.getDocumentCount();
    this.avgDocLen = (float) env.getCollectionSize() / numDocs;

    // Read first posting.
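    // A leaf wraps either a single term or a whitespace-delimited phrase. Phrases are
    // scored with an ordered-window proximity reader (window size 2) over the tokens'
    // postings lists; if any token is out of vocabulary, the whole node is marked OOV.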
    endOfList = false;

    // This is a leaf node (single term or phrase), so create its postings list.
    this.termOrPhrase = termOrPhrase;
    terms = termOrPhrase.split("\\s+");
    if (terms.length > 1) {
      operator = "phrase";
      List<PostingsReader> prs = new ArrayList<PostingsReader>();
      for (String term : terms) {
        PostingsList pl = env.getPostingsList(term);
        // If any of the tokens is OOV, then the phrase is considered OOV.
        if (pl == null) {
          isOOV = true;
          endOfList = true;
          return;
        }
        prs.add(pl.getPostingsReader());
      }
      postingsReader = new ProximityPostingsReaderOrderedWindow(prs.toArray(new PostingsReader[0]), 2);
      postingsReader.nextPosting(curPosting);
      gte = new GlobalTermEvidence(env.getDefaultDf(), env.getDefaultCf());
      this.ge = ge;
      lastScoredDocno = 0;
    } else {
      operator = "term";
      PostingsList pl = env.getPostingsList(termOrPhrase);
      if (pl == null) {
        isOOV = true;
        endOfList = true;
      } else {
        postingsReader = pl.getPostingsReader();
        gte = new GlobalTermEvidence(pl.getDf(), pl.getCf());
        this.ge = ge;
        lastScoredDocno = 0;
      }
    }
  }

  public NodeWeight computeScore(int curDocno) {
    NodeWeight score;
    if (isOOV) {
      int docLen = env.getDocumentLength(curDocno);
      score = new TfDfWeight(0, 0, docLen, numDocs, avgDocLen);
    } else if (!isLeaf()) {
      score = runOperator(curDocno);
      lastScoredDocno = curDocno;
    } else {
      // Leaf node. Advance the postings reader. Invariant: curPosting always points
      // to the next posting that has not yet been scored.
      while (!endOfList && postingsReader.getDocno() < curDocno) {
        if (!postingsReader.nextPosting(curPosting)) {
          endOfList = true;
        }
      }

      // Compute the term frequency if the postings list contains this docno; otherwise tf = 0.
      int tf = 0;
      if (curDocno == postingsReader.getDocno()) {
        tf = postingsReader.getTf();
      }

      int docLen = env.getDocumentLength(curDocno);
      score = new TfDfWeight(tf, gte.getDf(), docLen, numDocs, avgDocLen);
      lastScoredDocno = curDocno;
    }
    return score;
  }

  private NodeWeight runOperator(int curDocno) {
    // This is not a leaf node: compute the children's scores and combine them
    // according to the operator.
    NodeWeight[] scores = new NodeWeight[children.size()];
    for (int i = 0; i < children.size(); i++) {
      scores[i] = children.get(i).computeScore(curDocno);
    }
    int docLen = env.getDocumentLength(curDocno);

    NodeWeight resultScore;
    if (operator.equals("#combine")) {
      // Sum the BM25 scores of the children.
      float score = 0f;
      for (int i = 0; i < scores.length; i++) {
        score += scores[i].getScore();
      }
      resultScore = new FloatWeight(score);
    } else if (operator.equals("#weight")) {
      if (scores.length == 0) {
        resultScore = new FloatWeight();
      } else {
        if (scores[0] instanceof TfDfWeight) {
          resultScore = new TfDfWeight(0, 0, docLen, numDocs, avgDocLen);
        } else {
          resultScore = new FloatWeight();
        }
        // (tf, df) = sum_i { weight_i * (tf_i, df_i) }
        for (int i = 0; i < scores.length; i++) {
          resultScore.add(scores[i].multiply(weights[i]));
        }
      }
    } else if (operator.equals("#combweight")) {
      // Weighted sum of the children's BM25 scores.
      float score = 0f;
      for (int i = 0; i < scores.length; i++) {
        score += scores[i].getScore() * weights[i];
      }
      resultScore = new FloatWeight(score);
    } else {
      throw new RuntimeException("Unknown operator: " + operator);
    }
    return resultScore;
  }

  /**
   * @param docno current smallest candidate docno
   * @return next smallest docno from the postings lists of the leaf nodes
   */
  public int getNextCandidate(int docno) {
    if (isOOV) {
      return docno;
    } else if (!isLeaf()) {
      // Not a leaf node: take the smallest unscored candidate over all children.
      for (int i = 0; i < children.size(); i++) {
        int nextDocno = children.get(i).getNextCandidate(docno);
        if (nextDocno != lastScoredDocno && nextDocno < docno) {
          docno = nextDocno;
        }
      }
      return docno;
    } else {
      // Leaf node.
      if (endOfList) {
        return Integer.MAX_VALUE;
      }
      int nextDocno = findNextDocnoWithPositiveTF(operator);
      if (nextDocno == Integer.MAX_VALUE) {
        endOfList = true;
      }
      return nextDocno;
    }
  }

  private int findNextDocnoWithPositiveTF(String operator) {
    // Skip postings with zero term frequency and the posting that was already scored.
    boolean hasMore = true;
    while (hasMore && (postingsReader.getTf() == 0 || postingsReader.getDocno() == lastScoredDocno)) {
      hasMore = postingsReader.nextPosting(curPosting);
    }
    if (hasMore) {
      return postingsReader.getDocno();
    } else {
      return Integer.MAX_VALUE;
    }
  }

  public void reset() {
    endOfList = false;
    lastScoredDocno = -1;
  }

  public void setNextCandidate(int docno) {
    // Advance the postings reader. Invariant: curPosting always points to the next
    // posting that has not yet been scored.
    while (!endOfList && postingsReader.getDocno() < docno) {
      if (!postingsReader.nextPosting(curPosting)) {
        endOfList = true;
      }
    }
  }

  @Override
  public String toString() {
    if (isOOV) {
      return "OOV";
    } else if (!isLeaf()) {
      // Not a leaf node.
      return operator + "::" + values.toString();
    } else {
      return operator + "::" + Arrays.asList(terms).toString();
    }
  }

  private boolean isLeaf() {
    return postingsReader != null;
  }
}
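
// Illustrative driver sketch (not part of Ivory): it only shows the document-at-a-time
// loop that PostingsReaderWrapper is written for, using the methods defined above. The
// class and method names below are hypothetical, and obtaining the RetrievalEnvironment,
// the GlobalEvidence, and the structured-query JsonObject (e.g. {"#combine": ["a", "b"]})
// is assumed to happen elsewhere.
class PostingsReaderWrapperDriverSketch {
  static void scoreAllCandidates(JsonObject query, RetrievalEnvironment env, GlobalEvidence ge) {
    PostingsReaderWrapper root = new PostingsReaderWrapper(query, env, ge);
    root.reset();

    // Repeatedly ask the tree for the smallest unscored docno and score it until every
    // leaf's postings list is exhausted (getNextCandidate then returns Integer.MAX_VALUE).
    int docno = root.getNextCandidate(Integer.MAX_VALUE);
    while (docno < Integer.MAX_VALUE) {
      NodeWeight weight = root.computeScore(docno);
      System.out.println(docno + "\t" + weight.getScore());
      docno = root.getNextCandidate(Integer.MAX_VALUE);
    }
  }
}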