package org.apache.lucene.analysis.shingle;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column.Row;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.index.Payload;

/**
 * <p>A ShingleMatrixFilter constructs shingles (token n-grams) from a token stream.
 * In other words, it creates combinations of tokens as a single token.
 *
 * <p>For example, the sentence "please divide this sentence into shingles"
 * might be tokenized into the shingles "please divide", "divide this",
 * "this sentence", "sentence into", and "into shingles".
 *
 * <p>Using a shingle filter at both index and query time can in some instances
 * replace phrase queries, especially those with 0 slop.
 *
 * <p>Without a spacer character
 * it can be used to handle composition and decomposition of words,
 * such as searching for "multi dimensional" instead of "multidimensional".
 * This is a rather common problem for users at query time
 * in several languages, notably those of the northern Germanic branch.
 *
 * <p>Shingles are, among other things, also known to help solve problems
 * in spell checking, language detection and document clustering.
 *
 * <p>This filter is backed by a three dimensional column oriented matrix
 * used to create permutations of the second dimension, the rows,
 * and leaves the third, the z-axis, for multi token synonyms.
 *
 * <p>In order to use this filter you need to define a way of positioning
 * the input stream tokens in the matrix. This is done using a
 * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec}.
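 *
 * <p>For example, with the default codec a minimal setup might look like the
 * sketch below (the upstream token stream is assumed to come from whatever
 * tokenizer/filter chain you already use):
 * <pre>
 *   TokenStream tokens = ...; // any upstream tokenizer/filter chain
 *   ShingleMatrixFilter shingles = new ShingleMatrixFilter(tokens, 2, 3); // 2-3 gram shingles
 *   CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
 *   while (shingles.incrementToken()) {
 *     // term now holds the next shingle, e.g. "hello_world"
 *   }
 * </pre>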
 * <p>There are three simple implementations for demonstration purposes,
 * see {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec},
 * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec}
 * and {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec}.
 *
 * <p>Consider this token matrix:
 * <pre>
 * Token[column][row][z-axis]{
 *   {{hello}, {greetings, and, salutations}},
 *   {{world}, {earth}, {tellus}}
 * };
 * </pre>
 *
 * It would produce the following 2-3 gram sized shingles:
 *
 * <pre>
 * "hello_world"
 * "greetings_and"
 * "greetings_and_salutations"
 * "and_salutations"
 * "and_salutations_world"
 * "salutations_world"
 * "hello_earth"
 * "and_salutations_earth"
 * "salutations_earth"
 * "hello_tellus"
 * "and_salutations_tellus"
 * "salutations_tellus"
 * </pre>
 *
 * <p>This implementation can be rather heap demanding
 * if (maximum shingle size - minimum shingle size) is large and the stream contains many columns,
 * or if each column contains a great number of rows.
 *
 * <p>The problem is that in order to avoid producing duplicates
 * the filter needs to keep track of any shingle already produced and returned to the consumer.
 * There is a bit of resource management to handle this,
 * but it would of course be much better if the filter was written
 * so it never created the same shingle more than once in the first place.
 *
 * <p>The filter also has basic support for calculating weights for the shingles
 * based on the weights of the tokens from the input stream, output shingle size, etc.
 * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
 * <p/>
 * <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
 * the ones located in org.apache.lucene.analysis.tokenattributes.
 */
public final class ShingleMatrixFilter extends TokenStream {

  public static Character defaultSpacerCharacter = Character.valueOf('_');
  public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
  public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false;

  /**
   * Strategy used to code and decode meta data of the tokens from the input stream
   * regarding how to position the tokens in the matrix, set and retrieve weight, etc.
   */
  public static abstract class TokenSettingsCodec {

    /**
     * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted into a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
     * @param token
     * @return {@link ShingleMatrixFilter.TokenPositioner}
     * @throws IOException
     */
    public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;

    /**
     * Sets information on how a {@link org.apache.lucene.analysis.Token} is to be inserted into a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
     *
     * @param token
     * @param tokenPositioner
     */
    public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner);

    /**
     * Have this method return 1f in order to 'disable' weights.
     * @param token
     * @return the weight of parameter token
     */
    public abstract float getWeight(Token token);

    /**
     * Have this method do nothing in order to 'disable' weights.
     * @param token
     * @param weight
     */
    public abstract void setWeight(Token token, float weight);
  }

  /**
   * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted into a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
   * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token)
   * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner)
   */
  public static class TokenPositioner {
    public static final TokenPositioner newColumn = new TokenPositioner(0);
    public static final TokenPositioner newRow = new TokenPositioner(1);
    public static final TokenPositioner sameRow = new TokenPositioner(2);

    private final int index;

    private TokenPositioner(int index) {
      this.index = index;
    }

    public int getIndex() {
      return index;
    }
  }

  // filter instance settings variables
  private TokenSettingsCodec settingsCodec;
  private int minimumShingleSize;
  private int maximumShingleSize;
  private boolean ignoringSinglePrefixOrSuffixShingle = false;
  private Character spacerCharacter = defaultSpacerCharacter;
  private TokenStream input;

  private CharTermAttribute termAtt;
  private PositionIncrementAttribute posIncrAtt;
  private PayloadAttribute payloadAtt;
  private OffsetAttribute offsetAtt;
  private TypeAttribute typeAtt;
  private FlagsAttribute flagsAtt;

  private CharTermAttribute in_termAtt;
  private PositionIncrementAttribute in_posIncrAtt;
  private PayloadAttribute in_payloadAtt;
  private OffsetAttribute in_offsetAtt;
  private TypeAttribute in_typeAtt;
  private FlagsAttribute in_flagsAtt;

  /**
   * Creates a shingle filter based on a user defined matrix.
   *
   * The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
   * todo: don't touch the matrix! use a boolean, set the input stream to null or something, and keep track of where in the matrix we are at.
   *
   * @param matrix the input basis for creating shingles. Does not need to contain any information until {@link #incrementToken()} is called the first time.
   * @param minimumShingleSize minimum number of tokens in any shingle.
   * @param maximumShingleSize maximum number of tokens in any shingle.
   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
   * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
   * @param settingsCodec codec used to read input token weight and matrix positioning.
   */
  public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
    this.matrix = matrix;
    this.minimumShingleSize = minimumShingleSize;
    this.maximumShingleSize = maximumShingleSize;
    this.spacerCharacter = spacerCharacter;
    this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    this.settingsCodec = settingsCodec;

    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    payloadAtt = addAttribute(PayloadAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);

    // set the input to be an empty token stream, we already have the data.
    this.input = new EmptyTokenStream();

    in_termAtt = input.addAttribute(CharTermAttribute.class);
    in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
    in_payloadAtt = input.addAttribute(PayloadAttribute.class);
    in_offsetAtt = input.addAttribute(OffsetAttribute.class);
    in_typeAtt = input.addAttribute(TypeAttribute.class);
    in_flagsAtt = input.addAttribute(FlagsAttribute.class);
  }

  /**
   * Creates a shingle filter using default settings.
   *
   * @see #defaultSpacerCharacter
   * @see #ignoringSinglePrefixOrSuffixShingleByDefault
   * @see #defaultSettingsCodec
   *
   * @param input stream from which to construct the matrix
   * @param minimumShingleSize minimum number of tokens in any shingle.
   * @param maximumShingleSize maximum number of tokens in any shingle.
   */
  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize) {
    this(input, minimumShingleSize, maximumShingleSize, defaultSpacerCharacter);
  }

  /**
   * Creates a shingle filter using default settings.
   *
   * @see #ignoringSinglePrefixOrSuffixShingleByDefault
   * @see #defaultSettingsCodec
   *
   * @param input stream from which to construct the matrix
   * @param minimumShingleSize minimum number of tokens in any shingle.
   * @param maximumShingleSize maximum number of tokens in any shingle.
   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
   */
  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter) {
    this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingleByDefault);
  }

  /**
   * Creates a shingle filter using the default {@link TokenSettingsCodec}.
   *
   * @see #defaultSettingsCodec
   *
   * @param input stream from which to construct the matrix
   * @param minimumShingleSize minimum number of tokens in any shingle.
   * @param maximumShingleSize maximum number of tokens in any shingle.
   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
   * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
   */
  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle) {
    this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, defaultSettingsCodec);
  }

  /**
   * Creates a shingle filter with ad hoc parameter settings.
   *
   * @param input stream from which to construct the matrix
   * @param minimumShingleSize minimum number of tokens in any shingle.
   * @param maximumShingleSize maximum number of tokens in any shingle.
   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
   * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contain permutations of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
   * @param settingsCodec codec used to read input token weight and matrix positioning.
   */
  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
    this.input = input;
    this.minimumShingleSize = minimumShingleSize;
    this.maximumShingleSize = maximumShingleSize;
    this.spacerCharacter = spacerCharacter;
    this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
    this.settingsCodec = settingsCodec;

    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    payloadAtt = addAttribute(PayloadAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
    flagsAtt = addAttribute(FlagsAttribute.class);

    in_termAtt = input.addAttribute(CharTermAttribute.class);
    in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
    in_payloadAtt = input.addAttribute(PayloadAttribute.class);
    in_offsetAtt = input.addAttribute(OffsetAttribute.class);
    in_typeAtt = input.addAttribute(TypeAttribute.class);
    in_flagsAtt = input.addAttribute(FlagsAttribute.class);
  }

  // internal filter instance variables

  /** iterator over the current matrix row permutations */
  private Iterator<Matrix.Column.Row[]> permutations;

  /** the current permutation of tokens used to produce shingles */
  private List<Token> currentPermuationTokens;

  /** index to what row a token in currentPermuationTokens represents */
  private List<Matrix.Column.Row> currentPermutationRows;

  private int currentPermutationTokensStartOffset;
  private int currentShingleLength;

  /**
   * a set containing shingles that have been the result of a call to {@link #incrementToken()},
   * used to avoid producing the same shingle more than once.
   */
  private Set<List<Token>> shinglesSeen = new HashSet<List<Token>>();

  @Override
  public void reset() throws IOException {
    permutations = null;
    shinglesSeen.clear();
    input.reset();
  }

  private Matrix matrix;

  private Token reusableToken = new Token();

  @Override
  public final boolean incrementToken() throws IOException {
    if (matrix == null) {
      matrix = new Matrix();
      // fill matrix with maximumShingleSize columns
      while (matrix.columns.size() < maximumShingleSize && readColumn()) {
        // this loop looks ugly
      }
    }

    // this loop exists in order to avoid recursive calls to produceNextToken,
    // as the complexity of a large matrix
    // would otherwise require a multi gigabyte sized stack.
    Token token;
    do {
      token = produceNextToken(reusableToken);
    } while (token == request_next_token);
    if (token == null) return false;

    clearAttributes();
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    posIncrAtt.setPositionIncrement(token.getPositionIncrement());
    flagsAtt.setFlags(token.getFlags());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    typeAtt.setType(token.type());
    payloadAtt.setPayload(token.getPayload());
    return true;
  }

  private Token getNextInputToken(Token token) throws IOException {
    if (!input.incrementToken()) return null;
    token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
    token.setFlags(in_flagsAtt.getFlags());
    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
    token.setType(in_typeAtt.type());
    token.setPayload(in_payloadAtt.getPayload());
    return token;
  }

  private Token getNextToken(Token token) throws IOException {
    if (!this.incrementToken()) return null;
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setPositionIncrement(posIncrAtt.getPositionIncrement());
    token.setFlags(flagsAtt.getFlags());
    token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    return token;
  }

  private static final Token request_next_token = new Token();

  /**
   * This method exists in order to avoid recursive calls to itself,
   * as the complexity of even a fairly small matrix would otherwise easily
   * require a gigabyte sized stack per thread.
   *
   * @param reusableToken
   * @return null if exhausted, instance request_next_token if one more call is required for an answer, or the parameter instance reusableToken.
   * @throws IOException
   */
  private Token produceNextToken(final Token reusableToken) throws IOException {
    if (currentPermuationTokens != null) {
      currentShingleLength++;

      if (currentShingleLength + currentPermutationTokensStartOffset <= currentPermuationTokens.size()
          && currentShingleLength <= maximumShingleSize) {

        // it is possible to create at least one more shingle of the current matrix permutation

        if (ignoringSinglePrefixOrSuffixShingle
            && currentShingleLength == 1
            && ((currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isFirst()
                || (currentPermutationRows.get(currentPermutationTokensStartOffset)).getColumn().isLast())) {
          return getNextToken(reusableToken);
        }

        int termLength = 0;

        List<Token> shingle = new ArrayList<Token>(currentShingleLength);

        for (int i = 0; i < currentShingleLength; i++) {
          Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
          termLength += shingleToken.length();
          shingle.add(shingleToken);
        }
        if (spacerCharacter != null) {
          termLength += currentShingleLength - 1;
        }

        // only produce shingles that have not already been created
        if (!shinglesSeen.add(shingle)) {
          return request_next_token;
        }

        // shingle token factory
        StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
        for (Token shingleToken : shingle) {
          if (spacerCharacter != null && sb.length() > 0) {
            sb.append(spacerCharacter);
          }
          sb.append(shingleToken.buffer(), 0, shingleToken.length());
        }
        reusableToken.setEmpty().append(sb);
        updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);

        return reusableToken;

      } else {

        // it is NOT possible to create any more shingles of the current matrix permutation

        if (currentPermutationTokensStartOffset < currentPermuationTokens.size() - 1) {
          // reset shingle size and move one step to the right in the current tokens permutation
          currentPermutationTokensStartOffset++;
          currentShingleLength = minimumShingleSize - 1;
          return request_next_token;
        }

        if (permutations == null) {
          // todo does this ever occur?
          return null;
        }

        if (!permutations.hasNext()) {

          // load more data (if available) to the matrix
          if (input != null && readColumn()) {
            // don't really care, we just read it.
          }

          // get rid of resources

          // delete the first column in the matrix
          Matrix.Column deletedColumn = matrix.columns.remove(0);

          // remove all shingles seen that include any of the tokens from the deleted column.
          List<Token> deletedColumnTokens = new ArrayList<Token>();
          for (Matrix.Column.Row row : deletedColumn.getRows()) {
            for (Token token : row.getTokens()) {
              deletedColumnTokens.add(token);
            }
          }
          for (Iterator<List<Token>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
            List<Token> shingle = shinglesSeenIterator.next();
            for (Token deletedColumnToken : deletedColumnTokens) {
              if (shingle.contains(deletedColumnToken)) {
                shinglesSeenIterator.remove();
                break;
              }
            }
          }

          if (matrix.columns.size() < minimumShingleSize) {
            // exhausted
            return null;
          }

          // create permutations of the matrix as it now looks
          permutations = matrix.permutationIterator();
        }

        nextTokensPermutation();
        return request_next_token;
      }
    }

    if (permutations == null) {
      permutations = matrix.permutationIterator();
    }

    if (!permutations.hasNext()) {
      return null;
    }

    nextTokensPermutation();
    return request_next_token;
  }

  /**
   * Gets the next permutation of row combinations,
   * creates a list of all tokens in that permutation and
   * an index from each such token to the row it exists in,
   * and finally resets the current (next) shingle size and offset.
   */
  private void nextTokensPermutation() {
    Matrix.Column.Row[] rowsPermutation = permutations.next();

    List<Matrix.Column.Row> currentPermutationRows = new ArrayList<Matrix.Column.Row>();
    List<Token> currentPermuationTokens = new ArrayList<Token>();
    for (Matrix.Column.Row row : rowsPermutation) {
      for (Token token : row.getTokens()) {
        currentPermuationTokens.add(token);
        currentPermutationRows.add(row);
      }
    }
    this.currentPermuationTokens = currentPermuationTokens;
    this.currentPermutationRows = currentPermutationRows;

    currentPermutationTokensStartOffset = 0;
    currentShingleLength = minimumShingleSize - 1;
  }

  /**
   * Final touch of a shingle token before it is passed on to the consumer from method {@link #incrementToken()}.
   *
   * Calculates and sets type, flags, position increment, start/end offsets and weight.
   *
   * @param token Shingle token
   * @param shingle Tokens used to produce the shingle token.
   * @param currentPermutationStartOffset Start offset in parameter currentPermuationTokens
   * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermuationTokens
   * @param currentPermuationTokens tokens of the current permutation of rows in the matrix.
   */
  public void updateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens) {
    token.setType(ShingleMatrixFilter.class.getName());
    token.setFlags(0);
    token.setPositionIncrement(1);
    token.setStartOffset(shingle.get(0).startOffset());
    token.setEndOffset(shingle.get(shingle.size() - 1).endOffset());
    settingsCodec.setWeight(token, calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens));
  }

  /**
   * Evaluates the new shingle token weight.
   *
   * <pre>
   * for (shingle part token in shingle)
   *   weight += shingle part token weight * (1 / sqrt(all shingle part token weights summed))
   * </pre>
   *
   * This algorithm gives a slightly greater score to longer shingles
   * and rather penalises large shingle part token weights.
   *
   * @param shingleToken token returned to consumer
   * @param shingle the tokens used to produce the shingle token.
   * @param currentPermutationStartOffset start offset in parameter currentPermutationRows and currentPermuationTokens.
   * @param currentPermutationRows an index to the matrix row that each token in parameter currentPermuationTokens exists in.
   * @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (from parameter currentPermutationStartOffset, of length shingle.size()) equals parameter shingle.
   * @return weight to be set for parameter shingleToken
   */
  public float calculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Row> currentPermutationRows, List<Token> currentPermuationTokens) {
    double[] weights = new double[shingle.size()];

    double total = 0f;
    double top = 0d;

    for (int i = 0; i < weights.length; i++) {
      weights[i] = settingsCodec.getWeight(shingle.get(i));

      double tmp = weights[i];
      if (tmp > top) {
        top = tmp;
      }
      total += tmp;
    }

    double factor = 1d / Math.sqrt(total);

    double weight = 0d;
    for (double partWeight : weights) {
      weight += partWeight * factor;
    }

    return (float) weight;
  }
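
  /*
   * Weight arithmetic sketch (illustrative only) for calculateShingleWeight above:
   * if every part weight is 1f, as returned by the non-weighted codecs, an n-token
   * shingle gets total = n, factor = 1/sqrt(n) and thus weight = n/sqrt(n) = sqrt(n),
   * e.g. ~1.41 for a 2-gram and ~1.73 for a 3-gram -- the slightly greater score
   * for longer shingles mentioned in its javadoc.
   */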
  private Token readColumnBuf;

  /**
   * Loads one column from the token stream.
   *
   * When the last token has been read from the token stream, the column is marked as last
   * via column.setLast(true).
   *
   * @return true if it manages to read one more column from the input token stream
   * @throws IOException if the matrix source input stream throws an exception
   */
  private boolean readColumn() throws IOException {

    Token token;
    if (readColumnBuf != null) {
      token = readColumnBuf;
      readColumnBuf = null;
    } else {
      token = getNextInputToken(new Token());
    }

    if (token == null) {
      return false;
    }

    Matrix.Column currentReaderColumn = matrix.new Column();
    Matrix.Column.Row currentReaderRow = currentReaderColumn.new Row();

    currentReaderRow.getTokens().add(token);

    TokenPositioner tokenPositioner;
    while ((readColumnBuf = getNextInputToken(new Token())) != null
        && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {
      if (tokenPositioner == TokenPositioner.sameRow) {
        currentReaderRow.getTokens().add(readColumnBuf);
      } else /*if (tokenPositioner == TokenPositioner.newRow)*/ {
        currentReaderRow = currentReaderColumn.new Row();
        currentReaderRow.getTokens().add(readColumnBuf);
      }
      readColumnBuf = null;
    }

    if (readColumnBuf == null) {
      readColumnBuf = getNextInputToken(new Token());
      if (readColumnBuf == null) {
        currentReaderColumn.setLast(true);
      }
    }

    return true;
  }

  /**
   * A column focused matrix in three dimensions:
   *
   * <pre>
   * Token[column][row][z-axis] {
   *   {{hello}, {greetings, and, salutations}},
   *   {{world}, {earth}, {tellus}}
   * };
   * </pre>
   *
   * todo consider row groups
   * to indicate that shingles are only to contain permutations with texts in that same row group.
   */
  public static class Matrix {

    private boolean columnsHasBeenCreated = false;

    private List<Column> columns = new ArrayList<Column>();

    public List<Column> getColumns() {
      return columns;
    }

    public class Column {

      private boolean last;
      private boolean first;

      public Matrix getMatrix() {
        return Matrix.this;
      }

      public Column(Token token) {
        this();
        Row row = new Row();
        row.getTokens().add(token);
      }

      public Column() {
        synchronized (Matrix.this) {
          if (!columnsHasBeenCreated) {
            this.setFirst(true);
            columnsHasBeenCreated = true;
          }
        }
        Matrix.this.columns.add(this);
      }

      private List<Row> rows = new ArrayList<Row>();

      public List<Row> getRows() {
        return rows;
      }

      public int getIndex() {
        return Matrix.this.columns.indexOf(this);
      }

      @Override
      public String toString() {
        return "Column{" +
            "first=" + first +
            ", last=" + last +
            ", rows=" + rows +
            '}';
      }

      public boolean isFirst() {
        return first;
      }

      public void setFirst(boolean first) {
        this.first = first;
      }

      public void setLast(boolean last) {
        this.last = last;
      }

      public boolean isLast() {
        return last;
      }

      public class Row {

        public Column getColumn() {
          return Column.this;
        }

        private List<Token> tokens = new LinkedList<Token>();

        public Row() {
          Column.this.rows.add(this);
        }

        public int getIndex() {
          return Column.this.rows.indexOf(this);
        }

        public List<Token> getTokens() {
          return tokens;
        }

        public void setTokens(List<Token> tokens) {
          this.tokens = tokens;
        }

//        public int getStartOffset() {
//          int ret = tokens[0].startOffset();
//          if (getIndex() > 0 && ret == 0) {
//            ret = Column.this.rows.get(0).getStartOffset();
//          }
//          return ret;
//        }
//
//        public int getEndOffset() {
//          int ret = tokens[tokens.length - 1].endOffset();
//          if (getIndex() > 0 && ret == 0) {
//            ret = Column.this.rows.get(0).getEndOffset();
//          }
//          return ret;
//        }

        @Override
        public String toString() {
          return "Row{" +
              "index=" + getIndex() +
              ", tokens=" + (tokens == null ? null : tokens) +
              '}';
        }
      }
    }

    public Iterator<Column.Row[]> permutationIterator() {
      return new Iterator<Column.Row[]>() {

        private int[] columnRowCounters = new int[columns.size()];

        public void remove() {
          throw new IllegalStateException("not implemented");
        }

        public boolean hasNext() {
          int s = columnRowCounters.length;
          int n = columns.size();
          return s != 0 && n >= s && columnRowCounters[s - 1] < (columns.get(s - 1)).getRows().size();
        }

        public Column.Row[] next() {
          if (!hasNext()) {
            throw new NoSuchElementException("no more elements");
          }

          Column.Row[] rows = new Column.Row[columnRowCounters.length];

          for (int i = 0; i < columnRowCounters.length; i++) {
            rows[i] = columns.get(i).rows.get(columnRowCounters[i]);
          }
          incrementColumnRowCounters();
          return rows;
        }

        private void incrementColumnRowCounters() {
          for (int i = 0; i < columnRowCounters.length; i++) {
            columnRowCounters[i]++;
            if (columnRowCounters[i] == columns.get(i).rows.size() && i < columnRowCounters.length - 1) {
              columnRowCounters[i] = 0;
            } else {
              break;
            }
          }
        }
      };
    }

    @Override
    public String toString() {
      return "Matrix{" +
          "columns=" + columns +
          '}';
    }
  }

  public int getMinimumShingleSize() {
    return minimumShingleSize;
  }

  public void setMinimumShingleSize(int minimumShingleSize) {
    this.minimumShingleSize = minimumShingleSize;
  }

  public int getMaximumShingleSize() {
    return maximumShingleSize;
  }

  public void setMaximumShingleSize(int maximumShingleSize) {
    this.maximumShingleSize = maximumShingleSize;
  }

  public Matrix getMatrix() {
    return matrix;
  }

  public void setMatrix(Matrix matrix) {
    this.matrix = matrix;
  }

  public Character getSpacerCharacter() {
    return spacerCharacter;
  }

  public void setSpacerCharacter(Character spacerCharacter) {
    this.spacerCharacter = spacerCharacter;
  }

  public boolean isIgnoringSinglePrefixOrSuffixShingle() {
    return ignoringSinglePrefixOrSuffixShingle;
  }

  public void setIgnoringSinglePrefixOrSuffixShingle(boolean ignoringSinglePrefixOrSuffixShingle) {
    this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
  }

  /**
   * Using this codec makes a {@link ShingleMatrixFilter} act like {@link org.apache.lucene.analysis.shingle.ShingleFilter}.
   * It produces the simplest sort of shingles, ignoring token position increments, etc.
   *
   * It adds each token as a new column.
   */
  public static class OneDimensionalNonWeightedTokenSettingsCodec extends TokenSettingsCodec {

    @Override
    public TokenPositioner getTokenPositioner(Token token) throws IOException {
      return TokenPositioner.newColumn;
    }

    @Override
    public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
    }

    @Override
    public float getWeight(Token token) {
      return 1f;
    }

    @Override
    public void setWeight(Token token, float weight) {
    }
  }

  /**
   * A codec that creates a two dimensional matrix
   * by treating tokens from the input stream with 0 position increment
   * as new rows to the current column.
   */
  public static class TwoDimensionalNonWeightedSynonymTokenSettingsCodec extends TokenSettingsCodec {

    @Override
    public TokenPositioner getTokenPositioner(Token token) throws IOException {
      if (token.getPositionIncrement() == 0) {
        return TokenPositioner.newRow;
      } else {
        return TokenPositioner.newColumn;
      }
    }

    @Override
    public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
      throw new UnsupportedOperationException();
    }

    @Override
    public float getWeight(Token token) {
      return 1f;
    }

    @Override
    public void setWeight(Token token, float weight) {
    }
  }
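
  /*
   * Illustrative sketch only: how input tokens might be tagged for the
   * SimpleThreeDimensionalTokenSettingsCodec below, which reads the token
   * flags (0 = new column, 1 = new row, 2 = same row) and carries the weight
   * in the payload:
   *
   *   Token token = new Token("tellus", 6, 12);
   *   token.setFlags(1); // place the token in a new row of the current column
   *   token.setPayload(new Payload(PayloadHelper.encodeFloat(0.5f))); // weight 0.5
   */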
  /**
   * A full featured codec, not to be used for anything serious.
   *
   * It takes complete control of
   * the payload for weight
   * and the bit flags for positioning in the matrix.
   *
   * It mainly exists for demonstration purposes.
   */
  public static class SimpleThreeDimensionalTokenSettingsCodec extends TokenSettingsCodec {

    /**
     * @param token
     * @return the token flags int value as TokenPositioner
     * @throws IOException
     */
    @Override
    public TokenPositioner getTokenPositioner(Token token) throws IOException {
      switch (token.getFlags()) {
        case 0:
          return TokenPositioner.newColumn;
        case 1:
          return TokenPositioner.newRow;
        case 2:
          return TokenPositioner.sameRow;
      }
      throw new IOException("Unknown matrix positioning of token " + token);
    }

    /**
     * Sets the TokenPositioner as the token flags int value.
     *
     * @param token
     * @param tokenPositioner
     */
    @Override
    public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
      token.setFlags(tokenPositioner.getIndex());
    }

    /**
     * Returns a 32 bit float from the payload, or 1f if the payload is null.
     *
     * @param token
     * @return 32 bit float
     */
    @Override
    public float getWeight(Token token) {
      if (token.getPayload() == null || token.getPayload().getData() == null) {
        return 1f;
      } else {
        return PayloadHelper.decodeFloat(token.getPayload().getData());
      }
    }

    /**
     * Stores a 32 bit float in the payload, or sets it to null if the weight is 1f.
     *
     * @param token
     * @param weight
     */
    @Override
    public void setWeight(Token token, float weight) {
      if (weight == 1f) {
        token.setPayload(null);
      } else {
        token.setPayload(new Payload(PayloadHelper.encodeFloat(weight)));
      }
    }
  }
}