package org.apache.lucene.search.concordance.windowvisitor; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import org.apache.lucene.util.mutable.MutableValueInt; /** * Reusable object that records arrays of terms in the words before a target, in * the target and after a target. It includes information about overall tokens * as well. * <p> * Current implementation chooses reuse vs. security...no defensive copying of arrays. * <p> * See also the classic ConcordanceWindow that records strings for the context before, the * target and the context after the target. */ class ConcordanceArrayWindow { private final static String STOP_WORD = "\u2000"; private final static String FIELD_SEPARATOR = "\u2001"; private final static String EMPTY_STRING = ""; private final static char STOP_CHAR = '\u2000'; private final static char FIELD_SEP_CHAR = '\u2001'; private final static String STOP_WORD_TO_STRING = "_"; private final static String FIELD_SEPARATOR_TO_STRING = " | "; private final int positionIncrementGap; private final StringBuilder sb = new StringBuilder(); private final List<String> pres = new ArrayList<String>(); private final List<String> targs = new ArrayList<String>(); private final List<String> posts = new ArrayList<String>(); private int preSize = 0; private int postSize = 0; private final Map<String, MutableValueInt> tokens = new HashMap<String, MutableValueInt>(); private String target = EMPTY_STRING; /** * @param positionIncrementGap position increment gap used by analyzer */ public ConcordanceArrayWindow(int positionIncrementGap) { this.positionIncrementGap = positionIncrementGap; } /** * escape a string that might be made up entirely * of stop word sentinels or field separator sentinels */ private static String escape(String s) { if (s == null) { return EMPTY_STRING; } else if (allStops(s)) { //double up stop tokens return s + s; } else if (allFieldSeps(s)) { //double up field sep tokens return s + s; } else { return s; } } /** * assumes that this is not called on a true stop word/field separator marker! * */ private static String unescape(String s) { if (s == null) { return EMPTY_STRING; } else if (allStops(s)) { if (s.length() % 2 == 0) { int half = s.length() / 2; s = s.substring(0, half); return s; } else { //um, this shouldn't happen! //TODO: throw exception? return s; } } else if (allFieldSeps(s)) { if (s.length() % 2 == 0) { int half = s.length() / 2; s = s.substring(0, half); return s; } else { //um, this shouldn't happen! //TODO: throw Exception? return s; } } else { return s; } } //is this string entirely made up of field separators private static boolean allFieldSeps(String s) { for (int i = 0; i < s.length(); i++) { if (s.charAt(i) != FIELD_SEP_CHAR) { return false; } } return true; } //is this string made up entirely of stop word sentinels private static boolean allStops(String s) { for (int i = 0; i < s.length(); i++) { if (s.charAt(i) != STOP_CHAR) { return false; } } return true; } /** * Convert token to string * @param token token to convert * @return string */ static String tokenToString(String token) { if (token.equals(STOP_WORD)) { return STOP_WORD_TO_STRING; } else if (token.equals(FIELD_SEPARATOR)) { return FIELD_SEPARATOR_TO_STRING; } token = unescape(token); return token; } /** * @param string token is stop or field separator * @return whether the string is a sentinel for a stop word or field separator */ static boolean isStopOrFieldSeparator(String string) { if (string != null && string.length() == 1) { char c = string.charAt(0); if (c == STOP_CHAR || c == FIELD_SEP_CHAR) { return true; } } return false; } /** * @param string token to test * @return whether the string is a stop word sentinel */ static boolean isStop(String string) { if (string != null && string.length() == 1) { char c = string.charAt(0); if (c == STOP_CHAR) { return true; } } return false; } /** * @param string token to test * @return whether the string is a field separator sentinel */ static boolean isFieldSeparator(String string) { if (string != null && string.length() == 1) { char c = string.charAt(0); if (c == FIELD_SEP_CHAR) { return true; } } return false; } /** * insert a pre token at the beginning of the list of pres * * @param s to insert */ public void insertPre(String s) { s = escape(s); pres.add(0, s); preSize++; } /** * insert a stop word sentinel into the list of pre terms */ public void insertPreStop() { pres.add(0, STOP_WORD); preSize++; } /** * insert a field separator sentinel into the list of pre terms */ public void insertPreFieldSeparator() { pres.add(0, FIELD_SEPARATOR); preSize += positionIncrementGap; } /** * Add a token to the list of pres * * @param token to add to list of pres */ public void addPre(String token) { token = escape(token); pres.add(token); preSize++; } /** * add a stop word sentinel to the list of pres */ public void addPreStop() { pres.add(STOP_WORD); preSize++; } /** * add a field separator sentinel to the list of pres */ public void addPreFieldSeparator() { pres.add(FIELD_SEPARATOR); preSize += positionIncrementGap; } /** * add a token to the targets list * * @param token token to add */ public void addTarget(String token) { token = escape(token); targs.add(token); } /** * add a stop word sentinel to the targets list */ public void addTargetStop() { targs.add(STOP_WORD); } /** * add a field separator sentinel to the targets list */ public void addTargetFieldSeparator() { targs.add(FIELD_SEPARATOR); } /** * add a token to the posts list * * @param token token to add */ public void addPost(String token) { token = escape(token); posts.add(token); postSize++; } /** * add a stop word sentinel to the posts list */ public void addPostStop() { posts.add(STOP_WORD); postSize++; } /** * add a field separator sentinel to the stops list * and increment {@link #postSize} by the positionIncrement */ public void addPostFieldSeparator() { posts.add(FIELD_SEPARATOR); postSize += positionIncrementGap; } /** * @return all tokens and their counts from pres, posts and targets */ public Map<String, MutableValueInt> getAllTokens() { for (int i = 0; i < pres.size(); i++) { String s = pres.get(i); if (s.equals(STOP_WORD) || s.equals(FIELD_SEPARATOR)) { continue; } s = unescape(s); MutableValueInt mutInt = tokens.get(s); if (mutInt == null) { mutInt = new MutableValueInt(); mutInt.value = 0; } mutInt.value++; tokens.put(s, mutInt); } for (int i = 0; i < targs.size(); i++) { String s = targs.get(i); if (s.equals(STOP_WORD) || s.equals(FIELD_SEPARATOR)) { continue; } s = unescape(s); MutableValueInt mutInt = tokens.get(s); if (mutInt == null) { mutInt = new MutableValueInt(); mutInt.value = 0; } mutInt.value++; tokens.put(s, mutInt); } for (int i = 0; i < posts.size(); i++) { String s = posts.get(i); if (s.equals(STOP_WORD) || s.equals(FIELD_SEPARATOR)) { continue; } s = unescape(s); MutableValueInt mutInt = tokens.get(s); if (mutInt == null) { mutInt = new MutableValueInt(); mutInt.value = 0; } mutInt.value++; tokens.put(s, mutInt); } return tokens; } /** * @return target string */ public String getTarget() { return target; } /** * @param t target string */ public void setTarget(String t) { this.target = t; } /** * @return unique tokens in list of pres, targets and posts */ public Set<String> getTypes() { Set<String> set = new HashSet<String>(); for (int i = 0; i < pres.size(); i++) { String s = pres.get(i); if (s.equals(STOP_WORD) || s.equals(FIELD_SEPARATOR)) { continue; } s = unescape(s); set.add(s); } for (int i = 0; i < targs.size(); i++) { String s = targs.get(i); if (s.equals(STOP_WORD) || s.equals(FIELD_SEPARATOR)) { continue; } s = unescape(s); set.add(s); } for (int i = 0; i < posts.size(); i++) { String s = posts.get(i); if (s.equals(STOP_WORD) || s.equals(FIELD_SEPARATOR)) { continue; } s = unescape(s); set.add(s); } return set; } /** * @return string representation of window */ public String toString() { sb.setLength(0); for (int i = 0; i < pres.size() - 1; i++) { sb.append(tokenToString(pres.get(i))).append(" "); } if (pres.size() > 0) { sb.append(tokenToString(pres.get(pres.size() - 1))); } sb.append(">>>").append(target).append("<<<"); for (int i = 0; i < posts.size() - 1; i++) { sb.append(tokenToString(posts.get(i))).append(" "); } if (posts.size() > 0) { sb.append(tokenToString(posts.get(posts.size() - 1))); } return sb.toString(); } /** * reset state. clear arrays */ void reset() { pres.clear(); targs.clear(); posts.clear(); tokens.clear(); target = EMPTY_STRING; sb.setLength(0); preSize = 0; postSize = 0; } /** * @return underlying list of terms before the target. These may include * the raw markers for stop words and/or field separators. * Make sure to handle/unescape appropriately! */ List<String> getRawPreList() { return pres; } /** * @return underlying list of terms in the target. These may include * the raw markers for stop words and/or field separators. * Make sure to handle/unescape appropriately! */ List<String> getRawTargList() { return targs; } /** * @return underlying list of terms after the target. These may include * the raw markers for stop words and/or field separators. * Make sure to handle/unescape appropriately! */ List<String> getRawPostList() { return posts; } /** * @return number of tokens stored in the pre list plus position increments * for field boundaries */ public int getPreSize() { return preSize; } /** * @return number of tokens stored in the post list plus position increments * for field boundaries */ public int getPostSize() { return postSize; } /** * @return positionIncrementGap */ public int getPositionIncrementGap() { return positionIncrementGap; } }