package org.apache.lucene.index;

/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.*;

import org.apache.lucene.util.BytesRef;

/**
 * Store a sorted collection of {@link org.apache.lucene.index.TermVectorEntry}s.  Collects all term information
 * into a single, SortedSet.
 * <br/>
 * NOTE: This Mapper ignores all Field information for the Document.  This means that if you are using offset/positions you will not
 * know what Fields they correlate with.
 * <br/>
 * When the same term is mapped more than once, its frequency, offsets and positions are merged into a single entry.
 * The entry is taken out of the sorted set while it is being updated and re-inserted afterwards, so the set stays
 * correctly ordered even when the comparator looks at values that change during the merge (e.g. the frequency).
 * <br/>
 * This is not thread-safe
 */
public class SortedTermVectorMapper extends TermVectorMapper {

  /** All entries collected so far, ordered by the comparator given to the constructor. */
  private final SortedSet<TermVectorEntry> currentSet;

  /** Lookup from term to its (single, merged) entry. */
  private final Map<BytesRef,TermVectorEntry> termToTVE = new HashMap<BytesRef,TermVectorEntry>();

  /** Whether offsets are kept; set by {@link #setExpectations(String, int, boolean, boolean)}. */
  private boolean storeOffsets;

  /** Whether positions are kept; set by {@link #setExpectations(String, int, boolean, boolean)}. */
  private boolean storePositions;

  /**
   * Stand-in name for the field in {@link TermVectorEntry}.
   */
  public static final String ALL = "_ALL_";

  /**
   * Creates a mapper that passes <code>false</code> for both "ignoring" flags.
   *
   * @param comparator A Comparator for sorting {@link TermVectorEntry}s
   */
  public SortedTermVectorMapper(Comparator<TermVectorEntry> comparator) {
    this(false, false, comparator);
  }

  /**
   * Creates a mapper with explicit "ignoring" flags.
   *
   * @param ignoringPositions forwarded unchanged to the {@link TermVectorMapper} constructor
   * @param ignoringOffsets forwarded unchanged to the {@link TermVectorMapper} constructor
   * @param comparator A Comparator for sorting {@link TermVectorEntry}s
   */
  public SortedTermVectorMapper(boolean ignoringPositions, boolean ignoringOffsets, Comparator<TermVectorEntry> comparator) {
    super(ignoringPositions, ignoringOffsets);
    currentSet = new TreeSet<TermVectorEntry>(comparator);
  }

  /**
   * Records one occurrence group of a term.  The first time a term is seen a new entry (with field name
   * {@link #ALL}) is created; later calls for the same term are combined with that entry: frequencies are
   * summed and, when enabled, offsets and positions are appended to the ones already stored.
   *
   * @param term The term to map
   * @param frequency The frequency of the term
   * @param offsets Offset information, may be null
   * @param positions Position information, may be null
   */
  @Override
  public void map(BytesRef term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
    TermVectorEntry entry = termToTVE.get(term);
    if (entry == null) {
      entry = new TermVectorEntry(ALL, term, frequency,
              storeOffsets ? offsets : null,
              storePositions ? positions : null);
      termToTVE.put(term, entry);
      currentSet.add(entry);
      return;
    }

    // We need to combine any previous mentions of the term.  The entry must not be modified while it
    // sits in the sorted set: the comparator may depend on the values changed below, and a TreeSet
    // does not re-order elements that are mutated in place.
    detach(entry);

    entry.setFrequency(entry.getFrequency() + frequency);

    if (storeOffsets) {
      TermVectorOffsetInfo[] existingOffsets = entry.getOffsets();
      TermVectorOffsetInfo[] mergedOffsets = mergeOffsets(existingOffsets, offsets);
      // Only touch the entry when there is actually something new to store.
      if (mergedOffsets != existingOffsets) {
        entry.setOffsets(mergedOffsets);
      }
    }

    if (storePositions) {
      int[] existingPositions = entry.getPositions();
      int[] mergedPositions = mergePositions(existingPositions, positions);
      if (mergedPositions != existingPositions) {
        entry.setPositions(mergedPositions);
      }
    }

    // Re-insert so the entry lands at the position matching its updated values.  As with the initial
    // add, the set keeps only one element per comparator-equal group.
    currentSet.add(entry);
  }

  /**
   * Removes the given entry from the sorted set, but only if that exact instance is the one stored there.
   * The identity check avoids evicting a different entry that merely compares equal under the comparator.
   */
  private void detach(TermVectorEntry entry) {
    SortedSet<TermVectorEntry> tail = currentSet.tailSet(entry);
    if (!tail.isEmpty() && tail.first() == entry) {
      currentSet.remove(entry);
    }
  }

  /**
   * Appends incoming offsets to the existing ones.  Returns <code>existing</code> itself when there is
   * nothing to add, <code>incoming</code> itself when nothing was stored before, and a new array otherwise.
   */
  private static TermVectorOffsetInfo[] mergeOffsets(TermVectorOffsetInfo[] existing, TermVectorOffsetInfo[] incoming) {
    if (incoming == null || incoming.length == 0) {
      return existing;
    }
    if (existing == null) {
      return incoming;
    }
    TermVectorOffsetInfo[] merged = new TermVectorOffsetInfo[existing.length + incoming.length];
    System.arraycopy(existing, 0, merged, 0, existing.length);
    System.arraycopy(incoming, 0, merged, existing.length, incoming.length);
    return merged;
  }

  /**
   * Appends incoming positions to the existing ones.  Returns <code>existing</code> itself when there is
   * nothing to add, <code>incoming</code> itself when nothing was stored before, and a new array otherwise.
   */
  private static int[] mergePositions(int[] existing, int[] incoming) {
    if (incoming == null || incoming.length == 0) {
      return existing;
    }
    if (existing == null) {
      return incoming;
    }
    int[] merged = new int[existing.length + incoming.length];
    System.arraycopy(existing, 0, merged, 0, existing.length);
    System.arraycopy(incoming, 0, merged, existing.length, incoming.length);
    return merged;
  }

  /**
   * Remembers whether offsets and positions should be kept by subsequent {@link #map} calls.
   * The field name and the number of terms are not used by this mapper.
   *
   * @param field ignored
   * @param numTerms ignored
   * @param storeOffsets if <code>true</code>, offsets passed to {@link #map} are stored and merged
   * @param storePositions if <code>true</code>, positions passed to {@link #map} are stored and merged
   */
  @Override
  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
    this.storeOffsets = storeOffsets;
    this.storePositions = storePositions;
  }

  /**
   * The TermVectorEntrySet.  A SortedSet of {@link TermVectorEntry} objects.  Sort is by the comparator passed into the constructor.
   * <br/>
   * This set will be empty until after the mapping process takes place.
   *
   * @return The SortedSet of {@link TermVectorEntry}.
   */
  public SortedSet<TermVectorEntry> getTermVectorEntrySet() {
    return currentSet;
  }
}