package org.apache.lucene.search.concordance.charoffsets;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.BitSet;
import java.util.HashMap;
import java.util.Map;

/**
 * Class to record results for looking up normalized terms (String) and
 * character offsets for specified tokens. Will return NULL_TERM/NULL_OFFSET
 * if a token offset was not found.
 * <p>
 * Has utility methods for safely getting the closest found token. This is
 * useful when a concordance window ends in a stop word (no term/offset
 * info).
 */
public class RandomAccessCharOffsetContainer {

  public final static String NULL_TERM = "";
  private final static int NULL_OFFSET = -1;

  private BitSet set = new BitSet();
  private int last = -1;
  private final Map<Integer, String> terms = new HashMap<>();
  private final Map<Integer, Integer> starts = new HashMap<>();
  private final Map<Integer, Integer> ends = new HashMap<>();

  /**
   * @param tokenOffset     token of interest
   * @param startCharOffset start character offset within the string stored in StoredField[fieldIndex]
   * @param endCharOffset   end character offset within the string stored in StoredField[fieldIndex]
   * @param term            string term at that position
   */
  public void add(int tokenOffset, int startCharOffset, int endCharOffset,
                  String term) {
    addStart(tokenOffset, startCharOffset);
    addEnd(tokenOffset, endCharOffset);
    addTerm(tokenOffset, term);
    set.set(tokenOffset);
  }

  private void addTerm(int tokenOffset, String term) {
    if (term != null) {
      terms.put(tokenOffset, term);
    }
    last = (tokenOffset > last) ? tokenOffset : last;
  }

  private void addStart(int tokenOffset, int charOffset) {
    starts.put(tokenOffset, charOffset);
    last = (tokenOffset > last) ? tokenOffset : last;
  }

  private void addEnd(int tokenOffset, int charOffset) {
    ends.put(tokenOffset, charOffset);
    last = (tokenOffset > last) ? tokenOffset : last;
  }

  /**
   * @param tokenOffset target token
   * @return the character offset for the first character of the tokenOffset;
   *         returns {@link #NULL_OFFSET} if tokenOffset wasn't found
   */
  public int getCharacterOffsetStart(int tokenOffset) {
    Integer start = starts.get(tokenOffset);
    if (start == null) {
      return NULL_OFFSET;
    }
    return start.intValue();
  }

  /**
   * @param tokenOffset target token
   * @return the character offset for the final character of the tokenOffset;
   *         returns {@link #NULL_OFFSET} if tokenOffset wasn't found
   */
  public int getCharacterOffsetEnd(int tokenOffset) {
    Integer end = ends.get(tokenOffset);
    if (end == null) {
      return NULL_OFFSET;
    }
    return end.intValue();
  }

  /**
   * @param tokenOffset tokenOffset
   * @return term stored at this tokenOffset; can return {@link #NULL_TERM}
   */
  public String getTerm(int tokenOffset) {
    String s = terms.get(tokenOffset);
    if (s == null) {
      return NULL_TERM;
    }
    return s;
  }

  /**
   * @return last/largest token offset
   */
  public int getLast() {
    return last;
  }

  /**
   * reset state
   */
  public void clear() {
    terms.clear();
    starts.clear();
    ends.clear();
    last = -1;
    set = new BitSet();
  }

  protected boolean isEmpty() {
    return set.isEmpty();
  }

  /**
   * Find the closest non-null token starting from startToken
   * and ending with stopToken (inclusive).
   *
   * @param startToken start token
   * @param stopToken  end token
   * @param map        map to use
   * @return closest non-null token offset to the startToken; can return
   *         {@link #NULL_OFFSET} if no non-null offset was found
   */
  private int getClosestToken(int startToken, int stopToken,
                              Map<Integer, Integer> map) {
    if (startToken < 0 || stopToken < 0) {
      return NULL_OFFSET;
    }
    if (startToken == stopToken) {
      return startToken;
    }
    if (startToken < stopToken) {
      for (int i = startToken; i <= stopToken; i++) {
        Integer charOffset = map.get(i);
        if (charOffset != null && charOffset != NULL_OFFSET) {
          return i;
        }
      }
    } else if (startToken > stopToken) {
      for (int i = startToken; i >= stopToken; i--) {
        Integer charOffset = map.get(i);
        if (charOffset != null && charOffset != NULL_OFFSET) {
          return i;
        }
      }
    }
    return NULL_OFFSET;
  }

  /**
   * @return start character offset of the closest token to startToken (searching
   *         toward stopToken, inclusive) that has offset info; can return
   *         {@link #NULL_OFFSET} if none was found
   */
  public int getClosestCharStart(int startToken, int stopToken) {
    int i = getClosestToken(startToken, stopToken, starts);
    return getCharacterOffsetStart(i);
  }

  /**
   * @return end character offset of the closest token to startToken (searching
   *         toward stopToken, inclusive) that has offset info; can return
   *         {@link #NULL_OFFSET} if none was found
   */
  public int getClosestCharEnd(int startToken, int stopToken) {
    int i = getClosestToken(startToken, stopToken, ends);
    return getCharacterOffsetEnd(i);
  }

  protected String getClosestTerm(int startToken, int stopToken) {
    int i = getClosestToken(startToken, stopToken, starts);
    return getTerm(i);
  }

  /*
   * return: -1 if
   *
   * public int getFieldIndex(int tokenOffset) {
   *   CharCoordinate p = starts.get(tokenOffset);
   *   if (p == null) {
   *     return NULL_OFFSET;
   *   }
   *   return p.getFieldIndex();
   * }
   */

  protected String debugToString() {
    StringBuilder sb = new StringBuilder();
    for (Integer i : terms.keySet()) {
      sb.append(i).append(" : ").append(terms.get(i)).append(" : ")
          .append(starts.get(i)).append(" : ").append(ends.get(i)).append("\n");
    }
    return sb.toString();
  }

  protected BitSet getSet() {
    return set;
  }

  /**
   * Remove all term/offset info for this token; updates {@link #getLast()}
   * if the removed token was the last one.
   */
  public void remove(int token) {
    if (token == last) {
      last = getClosestToken(last - 1, 0, starts);
    }
    set.clear(token);
    terms.remove(token);
    starts.remove(token);
    ends.remove(token);
  }
}
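
/*
 * Minimal usage sketch (added for illustration; not part of the original
 * class). The example class name, token offsets, character offsets, and terms
 * below are made-up assumptions. It indexes the tokens of "the quick brown fox",
 * deliberately skips the stop word "the" at token 0, and shows how the
 * getClosest* methods fall back to the nearest token that has offset info.
 */
class RandomAccessCharOffsetContainerExample {
  public static void main(String[] args) {
    RandomAccessCharOffsetContainer offsets = new RandomAccessCharOffsetContainer();
    // "the quick brown fox": token 0 ("the") is a stop word and is never added
    offsets.add(1, 4, 9, "quick");
    offsets.add(2, 10, 15, "brown");
    offsets.add(3, 16, 19, "fox");

    // Direct lookups of the missing token return the NULL sentinels.
    System.out.println(offsets.getCharacterOffsetStart(0)); // -1 (NULL_OFFSET)
    System.out.println(offsets.getTerm(0));                 // "" (NULL_TERM)

    // "Closest" lookups scan from token 0 toward token 3 and settle on token 1.
    System.out.println(offsets.getClosestCharStart(0, 3));  // 4
    System.out.println(offsets.getClosestCharEnd(0, 3));    // 9
  }
}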