GATEDocument.java example

Explorer

mimir-master
- etc
  - generated-header.java
- mimir-client
  - src
    - gate
      - mimir
        index
        MimirConnector.java
        MimirIndexingPR.java
        search
        RemoteQueryRunner.java
        tool
        WebUtils.java
- mimir-cloud
  - archive-unpacker
    - src
      - gate
        mimir
        util
        MultiFileInputStream.java
        UnpackWizard.java
  - src
    - java
      - gate
        mimir
        util
        IndexArchiveState.java
        MultiFileOutputStream.java
- mimir-core
  - src
    - gate
      - mimir
        AbstractSemanticAnnotationHelper.java
        Constraint.java
        ConstraintType.java
        DocumentMetadataHelper.java
        DocumentRenderer.java
        IndexConfig.java
        MimirIndex.java
        SemanticAnnotationHelper.java
        index
        AtomicAnnotationIndex.java
        AtomicIndex.java
        AtomicTokenIndex.java
        DocumentCollection.java
        DocumentData.java
        GATEDocument.java
        GATEDocumentFactory.java
        IndexException.java
        Mention.java
        OriginalMarkupMetadataHelper.java
        package-info.java
        search
        FederatedQueryRunner.java
        IndexReaderPool.java
        QueryEngine.java
        QueryRunner.java
        RankingQueryRunnerImpl.java
        query
        AbstractIntersectionQueryExecutor.java
        AbstractOverlapQuery.java
        AbstractQueryExecutor.java
        AndQuery.java
        AnnotationQuery.java
        Binding.java
        ConstQuery.java
        ContainsQuery.java
        ExecutorsList.java
        GapQuery.java
        MinusQuery.java
        OrQuery.java
        QueryExecutor.java
        QueryNode.java
        RepeatsQuery.java
        SequenceQuery.java
        TermQuery.java
        WithinQuery.java
        parser
        ParseException.java
        Query.java
        QueryParser.java
        QueryParserConstants.java
        QueryParserTokenManager.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        score
        BindingScorer.java
        DelegatingScoringQueryExecutor.java
        MimirScorer.java
        terms
        AbstractCompoundTermsQuery.java
        AbstractDocumentsBasedTermsQuery.java
        AbstractIndexTermsQuery.java
        AndTermsQuery.java
        AnnotationTermsQuery.java
        CompoundTermsQuery.java
        ConstTermsQuery.java
        DocumentTermsQuery.java
        DocumentsAndTermsQuery.java
        DocumentsBasedTermsQuery.java
        DocumentsOrTermsQuery.java
        LimitTermsQuery.java
        OrTermsQuery.java
        SortedTermsQuery.java
        TermTypeTermsQuery.java
        TermsQuery.java
        TermsResultSet.java
        util
        DefaultMentionDescriber.java
        DelegatingSemanticAnnotationHelper.java
        DocumentFeaturesMetadataHelper.java
        IgnoreEmptiesTermProcessor.java
        IndexUpgrader.java
        MG4JTools.java
        NormalizingTermProcessor.java
        OntologyMentionDescriber.java
        TruncateIndex.java
- mimir-test
  - src
    - gate
      - mimir
        test
        QueryTests.java
        RenderZipCollection.java
        Scratch.java
        ScratchConsole.java
        TestQueryParser.java
        TestUtils.java
- mimir-web
  - src
    - gwt
      - gate
        mimir
        web
        client
        UI.java
    - java
      - gate
        mimir
        util
        LogAnalyser.java
        web
        client
        DocumentData.java
        GwtRpcService.java
        GwtRpcServiceAsync.java
        MimirSearchException.java
        ResultsData.java
- plugins
  - db-h2
    - src
      - gate
        mimir
        db
        AnnotationTemplateCache.java
        DBSemanticAnnotationHelper.java
  - measurements
    - src
      - gate
        mimir
        measurements
        MeasurementAnnotationHelper.java
        MeasurementPluginResource.java
  - sparql
    - src
      - gate
        mimir
        sparql
        RequestMethod.java
        SPARQLResultSet.java
        SPARQLSemanticAnnotationHelper.java

/*
 *  GATEDocument.java
 *
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *
 *  Valentin Tablan, 24 Feb 2009
 *
 *  $Id$
 */
package gate.mimir.index;

import gate.Annotation;
import gate.AnnotationSet;
import gate.mimir.IndexConfig;
import gate.util.OffsetComparator;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.di.big.mg4j.document.Document;

import java.io.*;
import java.util.Arrays;
import java.util.concurrent.BlockingQueue;

import org.apache.log4j.Logger;


/**
 * An implementation of MG4J Document interface for representing GATE documents
 * during the indexing process.
 *
 * At construction time the wrapped GATE document is flattened into two
 * parallel arrays: the offset-sorted token annotations, and the strings of
 * document content that follow each token. The word readers handed out by
 * {@link #wordReader(int)} iterate over these arrays instead of reading
 * character data, which is why {@link #content(int)} only returns an empty
 * placeholder reader.
 */
public class GATEDocument implements Document {
  /**
   * The URI prefix used for generating document URIs, when no explicit URI is
   * provided as a document feature.
   * The actual URIs will comprise this value with a number appended, taken
   * from the {@link #documentID} counter.
   */
  private static final String DOCUMENT_URI_PREFIX = "urn:mimir:document:";

  /**
   * A reader used to satisfy the MG4J interfaces, but that provides no actual
   * data.
   */
  private static final Reader emptyReader = new StringReader("");

  
  private static final Logger logger = Logger.getLogger(GATEDocument.class);
  
  /**
   * Guards {@link #documentID}. The counter is shared by all instances, so it
   * cannot be protected by the per-instance monitor used by {@link #uri()}.
   */
  private static final Object DOCUMENT_ID_LOCK = new Object();
  
  /**
   * Used to generate unique document URIs, if no URIs are provided as document 
   * features. Access only while holding {@link #DOCUMENT_ID_LOCK}.
   */
  private static long documentID = 0;
  
  /**
   * The number of occurrences (in all sub-indexes) generated as a result of 
   * indexing this document. Guarded by this object's monitor.
   */
  private long occurrences = 0;
  
  /**
   * An MG4J word reader for this document. It walks the enclosing document's
   * {@link GATEDocument#tokenAnnots} and {@link GATEDocument#nonTokens} arrays
   * in parallel, producing one (word, non-word) pair per token.
   */
  private class GATEDocumentWordReader implements WordReader{
    /**
     * the index of the next token
     */
    private int index = 0;
    
    /**
     * The token feature from which the data is read. 
     */
    private final String tokenFeature;
    
    /**
     * Constructs a GATE Document reader positioned before the first token of
     * the enclosing document.
     * @param tokenFeature the name of the feature to be read from the token 
     * annotations.
     */
    public GATEDocumentWordReader(String tokenFeature){
      this.tokenFeature = tokenFeature;
    }
    
    /**
     * Returns a new reader over the same document and token feature,
     * positioned before the first token.
     *
     * A separate instance is returned (rather than {@code this}) so that the
     * copy has its own cursor and advancing one reader does not silently
     * advance the other.
     *
     * @see it.unimi.dsi.io.WordReader#copy()
     */
    public WordReader copy() {
      return new GATEDocumentWordReader(tokenFeature);
    }

    /**
     * Produces the next token. The word is the value of the configured token
     * feature (cast to String) and the non-word is the document content
     * between this token and the next one.
     *
     * @return <code>true</code> if a token was produced, <code>false</code>
     * if all tokens have already been consumed.
     * @see it.unimi.dsi.io.WordReader#next(it.unimi.dsi.lang.MutableString, it.unimi.dsi.lang.MutableString)
     */
    public boolean next(MutableString word, MutableString nonWord)
            throws IOException {
      if(index >= tokenAnnots.length){
        return false;
      }
      word.replace((String)tokenAnnots[index].getFeatures().get(tokenFeature));
      nonWord.replace(nonTokens[index]);
      index++;
      return true;
    }

    /**
     * This reader does not consume character data, so the only reader accepted
     * here is the shared empty placeholder returned by
     * {@link GATEDocument#content(int)} (compared by identity).
     *
     * @throws UnsupportedOperationException if any other reader is supplied.
     * @see it.unimi.dsi.io.WordReader#setReader(java.io.Reader)
     */
    public WordReader setReader(Reader reader) {
      if(reader != emptyReader) 
        throw new UnsupportedOperationException(getClass().getName() + 
              " does not support resetting!");
      return this;
    }
    
  }
  
  /**
   * The index config for this document
   */
  private IndexConfig indexConfig;
  
  /**
   * The queue where this document should add itself upon closing.
   */
  private BlockingQueue<GATEDocument> outputQueue;
  
  /**
   * The GATE Document wrapped by this object.
   */
  private gate.Document gateDocument;
  
  /**
   * A list of all the token annotations, sorted by offset. 
   */
  private Annotation[] tokenAnnots;
  
  /**
   * A list containing all the strings between tokens. Element <code>i</code>
   * is the content following token <code>i</code>.
   */
  private String[] nonTokens;
  
  /**
   * A special instance of GATEDocument used to mark the end of a queue.
   * It wraps no GATE document: all of its instance fields are left unset.
   */
  public static final GATEDocument END_OF_QUEUE = new GATEDocument();
  
  /**
   * Constructor used to create the {@link #END_OF_QUEUE} instance. It leaves
   * the wrapped document, index config and token arrays unset.
   */
  protected GATEDocument(){
  }
  
  /**
   * Wraps a GATE document for indexing, extracting its tokens (as selected by
   * the token annotation set name and type from the index config) and the
   * document content found between them.
   *
   * @param gateDocument the GATE document to wrap.
   * @param indexConfig the index configuration, used to locate the token
   * annotations and, later, the document URI feature and token features.
   */
  public GATEDocument(gate.Document gateDocument,
          IndexConfig indexConfig){
    this.gateDocument = gateDocument;
    this.indexConfig = indexConfig;
    
    //build the list of tokens; a null set name selects the default set
    String tokenSetName = indexConfig.getTokenAnnotationSetName();
    AnnotationSet tokenSet = tokenSetName == null ?
            gateDocument.getAnnotations() :
            gateDocument.getAnnotations(tokenSetName);
    AnnotationSet allTokens = null;
    if(tokenSet != null) {
      synchronized(tokenSet) {
        allTokens = tokenSet.get(indexConfig
                        .getTokenAnnotationType());
      }
    }
    if(allTokens != null && allTokens.size() > 0){
      //we have some tokens
      tokenAnnots = allTokens.toArray(new Annotation[allTokens.size()]);
      Arrays.sort(tokenAnnots, new OffsetComparator());
    }else{
      //no tokens
      tokenAnnots = new Annotation[0];
    }
    //build the list of non-tokens
    nonTokens = new String[tokenAnnots.length];
    String docContent = gateDocument.getContent().toString();
    //for each token, add the doc content after it (and before the next token)
    //to the nonTokens array. Adjacent or overlapping tokens yield "".
    for(int i = 0; i < tokenAnnots.length - 1; i++){
      int nonTokenStart = tokenAnnots[i].getEndNode().getOffset().intValue();
      int nonTokenEnd = tokenAnnots[i+1].getStartNode().getOffset().intValue();
      nonTokens[i] = (nonTokenStart < nonTokenEnd) ?
              docContent.substring(nonTokenStart, nonTokenEnd) : "";
    }
    //set the last value to all remaining document content, if we have any tokens
    if(tokenAnnots.length > 0){
      int nonTokenStart = tokenAnnots[tokenAnnots.length - 1].getEndNode().
          getOffset().intValue();
      nonTokens[nonTokens.length -1] = (nonTokenStart < docContent.length()) ?
              docContent.substring(nonTokenStart) : "";
    }
  }
  
  /**
   * Puts this document in the output queue, blocking until the queue accepts
   * it. An output queue must have been provided through
   * {@link #setOutputQueue(BlockingQueue)} before this method is called.
   *
   * If the calling thread is interrupted while waiting, the document is not
   * queued and the thread's interrupted status is restored.
   *
   * @see it.unimi.di.big.mg4j.document.Document#close()
   */
  public void close() throws IOException {
    // put the finished document in the output queue 
    try {
      outputQueue.put(this);
    } catch(InterruptedException e) {
      // restore the flag so that the caller can observe the interruption
      Thread.currentThread().interrupt();
    }
  }

  
  /**
   * Sets the output queue for this document. When the {@link #close()} method
   * is called, this document will add itself to the output queue.  
   * @param outputQueue the outputQueue to set
   */
  public void setOutputQueue(BlockingQueue<GATEDocument> outputQueue) {
    this.outputQueue = outputQueue;
  }

  
  /**
   * Obtains the GATE document wrapped by this object.
   * @return the gateDocument
   */
  public gate.Document getDocument() {
    return gateDocument;
  }

  /**
   * Returns the shared empty reader, whatever the field: the actual data is
   * supplied by the word readers obtained from {@link #wordReader(int)}.
   *
   * @see it.unimi.di.big.mg4j.document.Document#content(int)
   */
  public Object content(int field) throws IOException {
    return emptyReader;
  }

  /**
   * Returns the name of the wrapped GATE document.
   *
   * @see it.unimi.di.big.mg4j.document.Document#title()
   */
  public CharSequence title() {
    return gateDocument.getName();
  }

  /**
   * Returns the URI of this document, read from the document feature named by
   * the index config. If the feature is missing, a default URI is generated
   * from {@link #DOCUMENT_URI_PREFIX} and a counter, a warning is logged, and
   * the generated value is stored back as a document feature so that later
   * calls return the same URI.
   *
   * @see it.unimi.di.big.mg4j.document.Document#uri()
   */
  public synchronized CharSequence uri() {
    String uriFeatureName = indexConfig.getDocumentUriFeatureName();
    String uri = (String)gateDocument.getFeatures().get(uriFeatureName);
    if(uri == null){
      // The counter is static, so the instance monitor held by this method
      // does not stop another document from reading the same value; take the
      // class-wide lock for the read-and-increment.
      long id;
      synchronized(DOCUMENT_ID_LOCK) {
        id = documentID++;
      }
      uri = DOCUMENT_URI_PREFIX + id;
      logger.warn(
        "No document URI provided, generating a default one: " + id);
      gateDocument.getFeatures().put(uriFeatureName, uri);
    }
    return uri;
  }

  /**
   * Notifies this GATEDocument that some more index occurrences were produced
   * in the process of indexing it.
   * 
   * This method is synchronized because the same GATEDocument instance is being
   * indexed in parallel by multiple sub-indexers.
   *  
   * @param newOccurrences the number of new occurrences generated
   */
  public synchronized void addOccurrences(long newOccurrences) {
    occurrences += newOccurrences;
  }
  
  /**
   * Returns the number of index occurrences that the indexing of this 
   * GATEDocument has generated. This value is only correct after the document
   * has been indexed by all sub-indexers.
   * 
   * Synchronized on the same monitor as {@link #addOccurrences(long)} so that
   * the reader sees the latest value written by the sub-indexer threads.
   * 
   * @return the number of occurrences.
   */
  public synchronized long getOccurrences() {
    return occurrences;
  }

  /**
   * Creates a word reader that produces, for each token, the value of the
   * feature configured for the given token indexer.
   *
   * @param field the position, in the index config's array of token indexers,
   * of the indexer whose feature name should be read.
   * @see it.unimi.di.big.mg4j.document.Document#wordReader(int)
   */
  public WordReader wordReader(int field) {
    return new GATEDocumentWordReader(
            indexConfig.getTokenIndexers()[field].getFeatureName());
  }

  /**
   * Gets the array of offset-sorted token annotations for this document.
   * The value returned is the actual internally used array, so modifications 
   * can lead to undefined behaviour! 
   * @return the tokenAnnots
   */
  public Annotation[] getTokenAnnots() {
    return tokenAnnots;
  }

  /**
   * Gets the array of string representing the document content segments between
   * the token annotations.
   * The value returned is the actual internally used array, so modifications 
   * can lead to undefined behaviour!
   * @return the nonTokens
   */
  public String[] getNonTokens() {
    return nonTokens;
  }
  
  
}