/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.vectorhighlight;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.search.vectorhighlight.FieldQuery.QueryPhraseMap;
import org.apache.lucene.search.vectorhighlight.FieldTermStack.TermInfo;
import org.apache.lucene.util.MergedIterator;

/**
 * FieldPhraseList has a list of WeightedPhraseInfo that is used by FragListBuilder
 * to create a FieldFragList object.
 */
public class FieldPhraseList {

  /**
   * List of non-overlapping WeightedPhraseInfo objects.
   */
  LinkedList<WeightedPhraseInfo> phraseList = new LinkedList<>();

  /**
   * create a FieldPhraseList that has no limit on the number of phrases to analyze
   *
   * @param fieldTermStack FieldTermStack object
   * @param fieldQuery FieldQuery object
   */
  public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery){
    this (fieldTermStack, fieldQuery, Integer.MAX_VALUE);
  }

  /**
   * return the list of WeightedPhraseInfo.
   *
   * @return phraseList.
   */
  public List<WeightedPhraseInfo> getPhraseList() {
    return phraseList;
  }

  /**
   * a constructor.
   *
   * @param fieldTermStack FieldTermStack object
   * @param fieldQuery FieldQuery object
   * @param phraseLimit maximum size of phraseList
   */
  public FieldPhraseList( FieldTermStack fieldTermStack, FieldQuery fieldQuery, int phraseLimit ){
    final String field = fieldTermStack.getFieldName();

    LinkedList<TermInfo> phraseCandidate = new LinkedList<>();
    QueryPhraseMap currMap = null;
    QueryPhraseMap nextMap = null;
    // Pop terms off the stack until it is exhausted or we have collected
    // phraseLimit phrases.
    while( !fieldTermStack.isEmpty() && (phraseList.size() < phraseLimit) )
    {
      phraseCandidate.clear();

      TermInfo ti = null;
      TermInfo first = null;

      first = ti = fieldTermStack.pop();
      currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
      // TermInfos at the same position appear to form a circular list via
      // getNext() (the loop stops when we wrap back to `first`): probe each
      // same-position term until one matches a query term.
      while (currMap == null && ti.getNext() != first) {
        ti = ti.getNext();
        currMap = fieldQuery.getFieldTermMap( field, ti.getText() );
      }

      // if not found, discard top TermInfo from stack, then try next element
      if( currMap == null ) continue;

      // if found, search the longest phrase
      phraseCandidate.add( ti );
      while( true ){
        // Try to extend the candidate with the next term on the stack
        // (again probing all same-position TermInfos).
        first = ti = fieldTermStack.pop();
        nextMap = null;
        if( ti != null ) {
          nextMap = currMap.getTermMap( ti.getText() );
          while (nextMap == null && ti.getNext() != first) {
            ti = ti.getNext();
            nextMap = currMap.getTermMap( ti.getText() );
          }
        }
        if( ti == null || nextMap == null ){
          // The candidate cannot be extended further.  Push back the
          // unconsumed term, then either record the candidate as-is ...
          if( ti != null )
            fieldTermStack.push( ti );
          if( currMap.isValidTermOrPhrase( phraseCandidate ) ){
            addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
          }
          else{
            // ... or shrink it one term at a time (returning terms to the
            // stack) until some prefix matches a query phrase.
            while( phraseCandidate.size() > 1 ){
              fieldTermStack.push( phraseCandidate.removeLast() );
              currMap = fieldQuery.searchPhrase( field, phraseCandidate );
              if( currMap != null ){
                addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) );
                break;
              }
            }
          }
          break;
        }
        else{
          // Extended successfully; descend into the phrase map and keep going.
          phraseCandidate.add( ti );
          currMap = nextMap;
        }
      }
    }
  }

  /**
   * Merging constructor.
   * @param toMerge FieldPhraseLists to merge to build this one
   */
  public FieldPhraseList( FieldPhraseList[] toMerge ) {
    // Merge all overlapping WeightedPhraseInfos
    // Step 1.  Sort by startOffset, endOffset, and boost, in that order.
    //          MergedIterator interleaves the per-list iterators; this
    //          presumably assumes each source phraseList is already sorted —
    //          TODO confirm against MergedIterator's contract.
    @SuppressWarnings( { "rawtypes", "unchecked" } )
    Iterator< WeightedPhraseInfo >[] allInfos = new Iterator[ toMerge.length ];
    int index = 0;
    for ( FieldPhraseList fplToMerge : toMerge ) {
      allInfos[ index++ ] = fplToMerge.phraseList.iterator();
    }
    MergedIterator< WeightedPhraseInfo > itr = new MergedIterator<>( false, allInfos );
    // Step 2.  Walk the sorted list merging infos that overlap
    phraseList = new LinkedList<>();
    if ( !itr.hasNext() ) {
      return;
    }
    // `work` accumulates a run of mutually overlapping infos;
    // `workEndOffset` is the furthest end offset seen in the current run.
    List< WeightedPhraseInfo > work = new ArrayList<>();
    WeightedPhraseInfo first = itr.next();
    work.add( first );
    int workEndOffset = first.getEndOffset();
    while ( itr.hasNext() ) {
      WeightedPhraseInfo current = itr.next();
      if ( current.getStartOffset() <= workEndOffset ) {
        // Overlaps the current run: extend it.
        workEndOffset = Math.max( workEndOffset, current.getEndOffset() );
        work.add( current );
      } else {
        // Gap: flush the finished run (a singleton is added directly,
        // a longer run is collapsed via the WeightedPhraseInfo merging
        // constructor), then start a new run with `current`.
        if ( work.size() == 1 ) {
          phraseList.add( work.get( 0 ) );
          work.set( 0, current );
        } else {
          phraseList.add( new WeightedPhraseInfo( work ) );
          work.clear();
          work.add( current );
        }
        workEndOffset = current.getEndOffset();
      }
    }
    // Flush the final run.
    if ( work.size() == 1 ) {
      phraseList.add( work.get( 0 ) );
    } else {
      phraseList.add( new WeightedPhraseInfo( work ) );
      work.clear();
    }
  }

  /**
   * Adds the given WeightedPhraseInfo, or, if it overlaps an already-recorded
   * phrase, folds its TermInfos into the existing entry instead of discarding
   * them.  (Previously the overlapping part — e.g. the second half of a
   * hyphenated word such as "social-economics" — was simply dropped, losing
   * its TermInfo data for further operations.)
   */
  public void addIfNoOverlap( WeightedPhraseInfo wpi ){
    for( WeightedPhraseInfo existWpi : getPhraseList() ){
      if( existWpi.isOffsetOverlap( wpi ) ) {
        // Keep the TermInfos of the overlapping phrase on the existing entry.
        existWpi.getTermsInfos().addAll( wpi.getTermsInfos() );
        return;
      }
    }
    getPhraseList().add( wpi );
  }

  /**
   * Represents the list of term offsets and boost for some text
   */
  public static class WeightedPhraseInfo implements Comparable< WeightedPhraseInfo > {
    private List<Toffs> termsOffsets;   // usually termsOffsets.size() == 1,
                                        // but if position-gap > 1 and slop > 0 then size() could be greater than 1
    private float boost;  // query boost
    private int seqnum;
    private ArrayList<TermInfo> termsInfos;

    /**
     * Text of the match, calculated on the fly.  Use for debugging only.
     * @return the text
     */
    public String getText() {
      StringBuilder text = new StringBuilder();
      for ( TermInfo ti: termsInfos ) {
        text.append( ti.getText() );
      }
      return text.toString();
    }

    /**
     * @return the termsOffsets
     */
    public List<Toffs> getTermsOffsets() {
      return termsOffsets;
    }

    /**
     * @return the boost
     */
    public float getBoost() {
      return boost;
    }

    /**
     * @return the termInfos
     */
    public List<TermInfo> getTermsInfos() {
      return termsInfos;
    }

    /** Convenience constructor with seqnum 0. */
    public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost ){
      this( terms, boost, 0 );
    }

    /**
     * @param terms the matched terms, in position order (must be non-empty)
     * @param boost query boost for this phrase
     * @param seqnum term/phrase number assigned by the query
     */
    public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
      this.boost = boost;
      this.seqnum = seqnum;

      // We keep TermInfos for further operations
      termsInfos = new ArrayList<>( terms );

      // Build the offset ranges: terms at consecutive positions are merged
      // into a single Toffs; a position gap starts a new Toffs.
      termsOffsets = new ArrayList<>( terms.size() );
      TermInfo ti = terms.get( 0 );
      termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
      if( terms.size() == 1 ){
        return;
      }
      int pos = ti.getPosition();
      for( int i = 1; i < terms.size(); i++ ){
        ti = terms.get( i );
        if( ti.getPosition() - pos == 1 ){
          // Adjacent position: extend the last range instead of adding one.
          Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
          to.setEndOffset( ti.getEndOffset() );
        }
        else{
          termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
        }
        pos = ti.getPosition();
      }
    }

    /**
     * Merging constructor.  Note that this just grabs seqnum from the first info.
*/ public WeightedPhraseInfo( Collection< WeightedPhraseInfo > toMerge ) { // Pretty much the same idea as merging FieldPhraseLists: // Step 1. Sort by startOffset, endOffset // While we are here merge the boosts and termInfos Iterator< WeightedPhraseInfo > toMergeItr = toMerge.iterator(); if ( !toMergeItr.hasNext() ) { throw new IllegalArgumentException( "toMerge must contain at least one WeightedPhraseInfo." ); } WeightedPhraseInfo first = toMergeItr.next(); @SuppressWarnings( { "rawtypes", "unchecked" } ) Iterator< Toffs >[] allToffs = new Iterator[ toMerge.size() ]; termsInfos = new ArrayList<>(); seqnum = first.seqnum; boost = first.boost; allToffs[ 0 ] = first.termsOffsets.iterator(); int index = 1; while ( toMergeItr.hasNext() ) { WeightedPhraseInfo info = toMergeItr.next(); boost += info.boost; termsInfos.addAll( info.termsInfos ); allToffs[ index++ ] = info.termsOffsets.iterator(); } // Step 2. Walk the sorted list merging overlaps MergedIterator< Toffs > itr = new MergedIterator<>( false, allToffs ); termsOffsets = new ArrayList<>(); if ( !itr.hasNext() ) { return; } Toffs work = itr.next(); while ( itr.hasNext() ) { Toffs current = itr.next(); if ( current.startOffset <= work.endOffset ) { work.endOffset = Math.max( work.endOffset, current.endOffset ); } else { termsOffsets.add( work ); work = current; } } termsOffsets.add( work ); } public int getStartOffset(){ return termsOffsets.get( 0 ).startOffset; } public int getEndOffset(){ return termsOffsets.get( termsOffsets.size() - 1 ).endOffset; } public boolean isOffsetOverlap( WeightedPhraseInfo other ){ int so = getStartOffset(); int eo = getEndOffset(); int oso = other.getStartOffset(); int oeo = other.getEndOffset(); if( so <= oso && oso < eo ) return true; if( so < oeo && oeo <= eo ) return true; if( oso <= so && so < oeo ) return true; if( oso < eo && eo <= oeo ) return true; return false; } @Override public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( getText() ).append( '(' 
).append( boost ).append( ")(" ); for( Toffs to : termsOffsets ){ sb.append( to ); } sb.append( ')' ); return sb.toString(); } /** * @return the seqnum */ public int getSeqnum() { return seqnum; } @Override public int compareTo( WeightedPhraseInfo other ) { int diff = getStartOffset() - other.getStartOffset(); if ( diff != 0 ) { return diff; } diff = getEndOffset() - other.getEndOffset(); if ( diff != 0 ) { return diff; } return (int) Math.signum( getBoost() - other.getBoost() ); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + getStartOffset(); result = prime * result + getEndOffset(); long b = Double.doubleToLongBits( getBoost() ); result = prime * result + ( int )( b ^ ( b >>> 32 ) ); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } WeightedPhraseInfo other = (WeightedPhraseInfo) obj; if (getStartOffset() != other.getStartOffset()) { return false; } if (getEndOffset() != other.getEndOffset()) { return false; } if (getBoost() != other.getBoost()) { return false; } return true; } /** * Term offsets (start + end) */ public static class Toffs implements Comparable< Toffs > { private int startOffset; private int endOffset; public Toffs( int startOffset, int endOffset ){ this.startOffset = startOffset; this.endOffset = endOffset; } public void setEndOffset( int endOffset ){ this.endOffset = endOffset; } public int getStartOffset(){ return startOffset; } public int getEndOffset(){ return endOffset; } @Override public int compareTo( Toffs other ) { int diff = getStartOffset() - other.getStartOffset(); if ( diff != 0 ) { return diff; } return getEndOffset() - other.getEndOffset(); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + getStartOffset(); result = prime * result + getEndOffset(); return result; } @Override public boolean 
equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } Toffs other = (Toffs) obj; if (getStartOffset() != other.getStartOffset()) { return false; } if (getEndOffset() != other.getEndOffset()) { return false; } return true; } @Override public String toString(){ StringBuilder sb = new StringBuilder(); sb.append( '(' ).append( startOffset ).append( ',' ).append( endOffset ).append( ')' ); return sb.toString(); } } } }