GraphTokenStreamFiniteStrings.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.util.graph;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

/**
 * Consumes a TokenStream and creates an {@link Automaton} where the transition labels are terms from
 * the {@link TermToBytesRefAttribute}.
 * This class also provides helpers to explore the different paths of the {@link Automaton}.
 */
public final class GraphTokenStreamFiniteStrings {
  private final Map<Integer, BytesRef> idToTerm = new HashMap<>();
  private final Map<Integer, Integer> idToInc = new HashMap<>();
  private final Automaton det;
  private final Transition transition = new Transition();

  private class FiniteStringsTokenStream extends TokenStream {
    private final BytesTermAttribute termAtt = addAttribute(BytesTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final IntsRef ids;
    private final int end;
    private int offset;

    FiniteStringsTokenStream(final IntsRef ids) {
      assert ids != null;
      this.ids = ids;
      this.offset = ids.offset;
      this.end = ids.offset + ids.length;
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (offset < end) {
        clearAttributes();
        int id = ids.ints[offset];
        termAtt.setBytesRef(idToTerm.get(id));

        int incr = 1;
        if (idToInc.containsKey(id)) {
          incr = idToInc.get(id);
        }
        posIncAtt.setPositionIncrement(incr);
        offset++;
        return true;
      }

      return false;
    }
  }

  public GraphTokenStreamFiniteStrings(TokenStream in) throws IOException {
    Automaton aut = build(in);
    this.det = Operations.removeDeadStates(Operations.determinize(aut, DEFAULT_MAX_DETERMINIZED_STATES));
  }

  /**
   * Returns whether the provided state is the start of multiple side paths of different length (eg: new york, ny)
   */
  public boolean hasSidePath(int state) {
    int numT = det.initTransition(state, transition);
    if (numT <= 1) {
      return false;
    }
    det.getNextTransition(transition);
    int dest = transition.dest;
    for (int i = 1; i < numT; i++) {
      det.getNextTransition(transition);
      if (dest != transition.dest) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns the list of terms that start at the provided state
   */
  public Term[] getTerms(String field, int state) {
    int numT = det.initTransition(state, transition);
    List<Term> terms = new ArrayList<> ();
    for (int i = 0; i < numT; i++) {
      det.getNextTransition(transition);
      for (int id = transition.min; id <= transition.max; id++) {
        Term term = new Term(field, idToTerm.get(id));
        terms.add(term);
      }
    }
    return terms.toArray(new Term[terms.size()]);
  }

  /**
   * Get all finite strings from the automaton.
   */
  public Iterator<TokenStream> getFiniteStrings() throws IOException {
    return getFiniteStrings(0, -1);
  }


  /**
   * Get all finite strings that start at {@code startState} and end at {@code endState}.
   */
  public Iterator<TokenStream> getFiniteStrings(int startState, int endState) throws IOException {
    final FiniteStringsIterator it = new FiniteStringsIterator(det, startState, endState);
    return new Iterator<TokenStream> () {
      IntsRef current;
      boolean finished = false;

      @Override
      public boolean hasNext() {
        if (finished == false && current == null) {
          current = it.next();
          if (current == null) {
            finished = true;
          }
        }
        return current != null;
      }

      @Override
      public TokenStream next() {
        if (current == null) {
          hasNext();
        }
        TokenStream next =  new FiniteStringsTokenStream(current);
        current = null;
        return next;
      }
    };
  }

  /**
   * Returns the articulation points (or cut vertices) of the graph:
   * https://en.wikipedia.org/wiki/Biconnected_component
   */
  public int[] articulationPoints() {
    if (det.getNumStates() == 0) {
      return new int[0];
    }
    //
    Automaton.Builder undirect = new Automaton.Builder();
    undirect.copy(det);
    for (int i = 0; i < det.getNumStates(); i++) {
      int numT = det.initTransition(i, transition);
      for (int j = 0; j < numT; j++) {
        det.getNextTransition(transition);
        undirect.addTransition(transition.dest, i, transition.min);
      }
    }
    int numStates = det.getNumStates();
    BitSet visited = new BitSet(numStates);
    int[] depth = new int[det.getNumStates()];
    int[] low = new int[det.getNumStates()];
    int[] parent = new int[det.getNumStates()];
    Arrays.fill(parent, -1);
    List<Integer> points = new ArrayList<>();
    articulationPointsRecurse(undirect.finish(), 0, 0, depth, low, parent, visited, points);
    Collections.reverse(points);
    return points.stream().mapToInt(p -> p).toArray();
  }

  /**
   * Build an automaton from the provided {@link TokenStream}.
   */
  private Automaton build(final TokenStream in) throws IOException {
    Automaton.Builder builder = new Automaton.Builder();
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);

    in.reset();

    int pos = -1;
    int prevIncr = 1;
    int state = -1;
    while (in.incrementToken()) {
      int currentIncr = posIncAtt.getPositionIncrement();
      if (pos == -1 && currentIncr < 1) {
        throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
      }

      // always use inc 1 while building, but save original increment
      int incr = Math.min(1, currentIncr);
      if (incr > 0) {
        pos += incr;
      }

      int endPos = pos + posLengthAtt.getPositionLength();
      while (state < endPos) {
        state = builder.createState();
      }

      BytesRef term = termBytesAtt.getBytesRef();
      int id = getTermID(currentIncr, prevIncr, term);
      builder.addTransition(pos, endPos, id);

      // only save last increment on non-zero increment in case we have multiple stacked tokens
      if (currentIncr > 0) {
        prevIncr = currentIncr;
      }
    }

    in.end();
    if (state != -1) {
      builder.setAccept(state, true);
    }
    return builder.finish();
  }

  /**
   * Gets an integer id for a given term and saves the position increment if needed.
   */
  private int getTermID(int incr, int prevIncr, BytesRef term) {
    assert term != null;
    boolean isStackedGap = incr == 0 && prevIncr > 1;
    int id = idToTerm.size();
    idToTerm.put(id, BytesRef.deepCopyOf(term));
    // stacked token should have the same increment as original token at this position
    if (isStackedGap) {
      idToInc.put(id, prevIncr);
    } else if (incr > 1) {
      idToInc.put(id, incr);
    }
    return id;
  }

  private static void articulationPointsRecurse(Automaton a, int state, int d, int[] depth, int[] low, int[] parent,
                                                BitSet visited, List<Integer> points) {
    visited.set(state);
    depth[state] = d;
    low[state] = d;
    int childCount = 0;
    boolean isArticulation = false;
    Transition t = new Transition();
    int numT = a.initTransition(state, t);
    for (int i = 0; i < numT; i++) {
      a.getNextTransition(t);
      if (visited.get(t.dest) == false) {
        parent[t.dest] = state;
        articulationPointsRecurse(a, t.dest, d + 1, depth, low, parent, visited, points);
        childCount++;
        if (low[t.dest] >= depth[state]) {
          isArticulation = true;
        }
        low[state] = Math.min(low[state], low[t.dest]);
      } else if (t.dest != parent[state]) {
        low[state] = Math.min(low[state], depth[t.dest]);
      }
    }
    if ((parent[state] != -1 && isArticulation) || (parent[state] == -1 && childCount > 1)) {
      points.add(state);
    }
  }
}