OriginalMarkupMetadataHelper.java example

Explorer

mimir-master
- etc
  - generated-header.java
- mimir-client
  - src
    - gate
      - mimir
        index
        MimirConnector.java
        MimirIndexingPR.java
        search
        RemoteQueryRunner.java
        tool
        WebUtils.java
- mimir-cloud
  - archive-unpacker
    - src
      - gate
        mimir
        util
        MultiFileInputStream.java
        UnpackWizard.java
  - src
    - java
      - gate
        mimir
        util
        IndexArchiveState.java
        MultiFileOutputStream.java
- mimir-core
  - src
    - gate
      - mimir
        AbstractSemanticAnnotationHelper.java
        Constraint.java
        ConstraintType.java
        DocumentMetadataHelper.java
        DocumentRenderer.java
        IndexConfig.java
        MimirIndex.java
        SemanticAnnotationHelper.java
        index
        AtomicAnnotationIndex.java
        AtomicIndex.java
        AtomicTokenIndex.java
        DocumentCollection.java
        DocumentData.java
        GATEDocument.java
        GATEDocumentFactory.java
        IndexException.java
        Mention.java
        OriginalMarkupMetadataHelper.java
        package-info.java
        search
        FederatedQueryRunner.java
        IndexReaderPool.java
        QueryEngine.java
        QueryRunner.java
        RankingQueryRunnerImpl.java
        query
        AbstractIntersectionQueryExecutor.java
        AbstractOverlapQuery.java
        AbstractQueryExecutor.java
        AndQuery.java
        AnnotationQuery.java
        Binding.java
        ConstQuery.java
        ContainsQuery.java
        ExecutorsList.java
        GapQuery.java
        MinusQuery.java
        OrQuery.java
        QueryExecutor.java
        QueryNode.java
        RepeatsQuery.java
        SequenceQuery.java
        TermQuery.java
        WithinQuery.java
        parser
        ParseException.java
        Query.java
        QueryParser.java
        QueryParserConstants.java
        QueryParserTokenManager.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        score
        BindingScorer.java
        DelegatingScoringQueryExecutor.java
        MimirScorer.java
        terms
        AbstractCompoundTermsQuery.java
        AbstractDocumentsBasedTermsQuery.java
        AbstractIndexTermsQuery.java
        AndTermsQuery.java
        AnnotationTermsQuery.java
        CompoundTermsQuery.java
        ConstTermsQuery.java
        DocumentTermsQuery.java
        DocumentsAndTermsQuery.java
        DocumentsBasedTermsQuery.java
        DocumentsOrTermsQuery.java
        LimitTermsQuery.java
        OrTermsQuery.java
        SortedTermsQuery.java
        TermTypeTermsQuery.java
        TermsQuery.java
        TermsResultSet.java
        util
        DefaultMentionDescriber.java
        DelegatingSemanticAnnotationHelper.java
        DocumentFeaturesMetadataHelper.java
        IgnoreEmptiesTermProcessor.java
        IndexUpgrader.java
        MG4JTools.java
        NormalizingTermProcessor.java
        OntologyMentionDescriber.java
        TruncateIndex.java
- mimir-test
  - src
    - gate
      - mimir
        test
        QueryTests.java
        RenderZipCollection.java
        Scratch.java
        ScratchConsole.java
        TestQueryParser.java
        TestUtils.java
- mimir-web
  - src
    - gwt
      - gate
        mimir
        web
        client
        UI.java
    - java
      - gate
        mimir
        util
        LogAnalyser.java
        web
        client
        DocumentData.java
        GwtRpcService.java
        GwtRpcServiceAsync.java
        MimirSearchException.java
        ResultsData.java
- plugins
  - db-h2
    - src
      - gate
        mimir
        db
        AnnotationTemplateCache.java
        DBSemanticAnnotationHelper.java
  - measurements
    - src
      - gate
        mimir
        measurements
        MeasurementAnnotationHelper.java
        MeasurementPluginResource.java
  - sparql
    - src
      - gate
        mimir
        sparql
        RequestMethod.java
        SPARQLResultSet.java
        SPARQLSemanticAnnotationHelper.java

/*
 *  OriginalMarkupMetadataHelper.java
 *
 *  Copyright (c) 2007-2011, The University of Sheffield.
 *
 *  This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html), 
 *  and is free software, licenced under the GNU Lesser General Public License,
 *  Version 3, June 2007 (also included with this distribution as file
 *  LICENCE-LGPL3.html).
 *
 *  Valentin Tablan, 7 Oct 2009
 *
 *  $Id$
 */
package gate.mimir.index;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;

import gate.Annotation;
import gate.AnnotationSet;
import gate.GateConstants;
import gate.mimir.DocumentMetadataHelper;
import gate.mimir.DocumentRenderer;
import gate.mimir.search.query.Binding;

/**
 * An implementation of {@link DocumentMetadataHelper} and 
 * {@link DocumentRenderer} that imports relevant markup tags from the indexed
 * document's original markups annotation set, saves them as document metadata
 * in the zip collection, and then uses these saved values to render the
 * document at search time.
 * 
 * The metadata saved by this class is stored in the main document metadata map 
 * using the name of this helper's runtime class as the key. The value saved 
 * is itself a Map, holding multiple metadata fields. 
 */
public class OriginalMarkupMetadataHelper implements DocumentMetadataHelper, 
    DocumentRenderer {
  
  /**
   * Creates a new document helper/renderer.
   * @param markupAnnTypes the types of annotations from the original markups 
   * set that should be saved as document metadata (and used for rendering 
   * documents). If the value given for this parameter is <code>null</code>, 
   * then the default set of tags is used (see {@link #DEFAULT_TAG_TYPES}). 
   */
  public OriginalMarkupMetadataHelper(Set<String> markupAnnTypes) {
    if(markupAnnTypes == null){
      markupAnnTypes = new HashSet<String>();
      for(String aTag : DEFAULT_TAG_TYPES) markupAnnTypes.add(aTag);
    }
    this.markupAnnTypes = markupAnnTypes;
  }

  /**
   * Renders a document, using the saved original markup tags, and adding tags
   * for the provided query hits. This will generate HTML (XML, XHTML, etc) 
   * content by printing out the document tokens, spaces, and tags.
   * 
   * @param documentData the {@link DocumentData} object for the document to be
   * rendered.
   * @param hits the list of hits that need to also be rendered.
   * @param output a {@link Appendable} to which the output is written. 
   * @see gate.mimir.DocumentRenderer#render(gate.mimir.index.DocumentData, java.util.List, java.lang.Appendable)
   */
  public void render(DocumentData documentData, List<Binding> hits,
          Appendable output) throws IOException {
    String[] tokens = documentData.getTokens();
    String[] nonTokens = documentData.getNonTokens();
    
    DocumentTags docTags = (DocumentTags)getMetadataField(documentData, TAGS_KEY);
    //tags that have been opened and need to close:
    //key = token offset for close tag
    //value: list of tag IDs that end at that location
    SortedMap<Integer, LinkedList<String>> spansToEnd = 
      new TreeMap<Integer, LinkedList<String>>();
    Iterator<int[]> tagIter = docTags.tags != null ? 
            docTags.tags.iterator() : null;
    int[] currentTag = (tagIter != null && tagIter.hasNext()) ? 
            tagIter.next() : null;
    Iterator<Binding> hitIter = hits != null ? hits.iterator() : null;
    Binding currentHit = (hitIter != null && hitIter.hasNext()) ? 
            hitIter.next() : null;
    for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++){
      if(docTags != null){
        //check if we need to open any tags here
        while((currentTag != null && currentTag[1] == tokIdx) ||
              (currentHit != null && currentHit.getTermPosition() == tokIdx)){
          //we need to open a tag or a hit
          if(currentTag != null && currentTag[1] == tokIdx &&
             currentHit != null && currentHit.getTermPosition() == tokIdx){
            //we have both a tag and a hit, starting at the same position
            //we start the one that ends later, with a preference for a tag
            //(as hits should be inner-most)
            if(currentTag[2] >= (currentHit.getTermPosition() + currentHit.getLength())){
              //consume the TAG
              String openingTag = docTags.tagDescriptors.get(currentTag[0]);
              output.append(openingTag);
              String closingTag = getClosingTag(openingTag);
              if(currentTag[2] == -1) {
                // zero-length tag
                output.append(closingTag);
              } else {
                LinkedList<String> spans = spansToEnd.get(currentTag[2]);
                if(spans == null){
                  spans = new LinkedList<String>();
                  spansToEnd.put(currentTag[2], spans);
                }
                spans.addFirst(closingTag);                
              }
              //consume the tag
              currentTag = (tagIter != null && tagIter.hasNext()) ? 
                      tagIter.next() : null;
            }else{
              //consume the HIT
              output.append(HIT_OPENING_TAG);
              int spanEnd = currentHit.getTermPosition() + currentHit.getLength() -1; 
              LinkedList<String> spans = spansToEnd.get(spanEnd);
              if(spans == null){
                spans = new LinkedList<String>();
                spansToEnd.put(spanEnd, spans);
              }
              spans.addFirst(HIT_CLOSING_TAG);
              //consume the hit
              currentHit = (hitIter != null && hitIter.hasNext()) ? 
                      hitIter.next() : null;
            }
          }else if(currentTag != null && currentTag[1] == tokIdx){
            //we only have a TAG to use
            String openingTag = docTags.tagDescriptors.get(currentTag[0]);
            output.append(openingTag);
            String closingTag = getClosingTag(openingTag);
            if(currentTag[2] == -1) {
              // zero-length tag
              output.append(closingTag);
            } else {
              LinkedList<String> spans = spansToEnd.get(currentTag[2]);
              if(spans == null){
                spans = new LinkedList<String>();
                spansToEnd.put(currentTag[2], spans);
              }
              spans.addFirst(closingTag);                
            }
            //consume the tag
            currentTag = (tagIter != null && tagIter.hasNext()) ? 
                    tagIter.next() : null;
          }else{
            //we only have a HIT to use
            output.append(HIT_OPENING_TAG);
            int spanEnd = currentHit.getTermPosition() + currentHit.getLength() -1;
            LinkedList<String> spans = spansToEnd.get(spanEnd);
            if(spans == null){
              spans = new LinkedList<String>();
              spansToEnd.put(spanEnd, spans);
            }
            spans.addFirst(HIT_CLOSING_TAG);
            //consume the hit
            currentHit = (hitIter != null && hitIter.hasNext()) ? 
                    hitIter.next() : null;
          }
        }
      }
      //write the token
      output.append(tokens[tokIdx]);
      
      //check if we need to close any spans here
      while(spansToEnd.size() > 0 && spansToEnd.firstKey() == tokIdx){
        LinkedList<String> closingTags = spansToEnd.remove(spansToEnd.firstKey());
        for(String aTag : closingTags){
          output.append(aTag);
        }
      }
      //write the non-token, if any
      if(tokIdx < nonTokens.length) output.append(nonTokens[tokIdx]);

    }
  }

  /* (non-Javadoc)
   * @see gate.mimir.index.DocumentMetadataHelper#documentEnd(gate.Document, gate.mimir.index.mg4j.zipcollection.DocumentData)
   */
  public void documentEnd(GATEDocument document, DocumentData documentData) {
    //here we need to store the relevant markup as a metadata field
    DocumentTags documentTags = new DocumentTags();
    //a list of annotations to save
    AnnotationSet omSet = document.getDocument().getAnnotations(
            GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    AnnotationSet tagsSet = null;
    synchronized(omSet) {
      tagsSet = omSet.get(markupAnnTypes);
    }
    List<Annotation> tagsToSave = new ArrayList<Annotation>(tagsSet);
    Collections.sort(tagsToSave, new StartComparator());
    //a structure holding the tags that need to be closed.
    //key (long): the document offset where the tag should close
    //value (int): the index in the tags array of the current document metadata
    SortedMap<Long, LinkedList<Integer>> tagsToEnd = 
      new TreeMap<Long, LinkedList<Integer>>();
    Annotation[] tokens = document.getTokenAnnots();
    Iterator<Annotation> tagsiter = tagsToSave.iterator();
    Annotation currentTag = tagsiter.hasNext() ? tagsiter.next() : null;
    long tagStart = currentTag == null ? -1 : currentTag.getStartNode().getOffset();
    long tagEnd = currentTag == null ? -1 : currentTag.getEndNode().getOffset();
    for(int tokIdx = 0; tokIdx < tokens.length; tokIdx++) {
      long tokStart = tokens[tokIdx].getStartNode().getOffset();
      long tokEnd = tokens[tokIdx].getEndNode().getOffset();
      //see if there are any tags to close at this offset
      while(tagsToEnd.size() > 0 && tagsToEnd.firstKey() <= tokStart){
        //get all tags ending inside the previous token or the space before the 
        //current token
        LinkedList<Integer> tags = tagsToEnd.remove(tagsToEnd.firstKey());
        for(int aTag : tags){
          documentTags.tags.get(aTag)[2] = tokIdx -1;
        }
      }
      //see if we need to save any tags at this offset
      while(currentTag != null){
        if(tagStart < tokEnd){
          //the current tag starts within the current token
          int tagDescId = getTagId(currentTag, documentTags);
          documentTags.tags.add(new int[]{tagDescId, tokIdx, -1});
          if(tagEnd <= tokStart){
            // the tag starts and ends before the current token starts, so it's
            // either zero-length, or whitespace-only
            // leave the end position as -1.
          } else {
            // not a zero-length tag, 
            // so we'll need to find the closing position later
            LinkedList<Integer> tagsEnding = tagsToEnd.get(tagEnd);
            if(tagsEnding == null){
              tagsEnding = new LinkedList<Integer>();
              tagsToEnd.put(tagEnd, tagsEnding);
            }
            tagsEnding.addFirst(documentTags.tags.size() -1);            
          }
          //update the current tag
          currentTag = tagsiter.hasNext() ? tagsiter.next() : null;
          tagStart = currentTag == null ? -1 : currentTag.getStartNode().getOffset();
          tagEnd = currentTag == null ? -1 : currentTag.getEndNode().getOffset();
        }else{
          //current tag not inside the current token yet.
          break;
        }
      }//while currentTag != null
    }//for tokens
    while(tagsToEnd.size() > 0){
      //we did not close all tags yet
      int tokIdx = tokens.length -1;
      LinkedList<Integer> tags = tagsToEnd.remove(tagsToEnd.firstKey());
      for(int aTag : tags){
        documentTags.tags.get(aTag)[2] = tokIdx;
      }
    }
    
    while(currentTag != null){
      //we did not exhaust all tags, we'll assign all remaining tags to the last
      //token
      int tokIdx = tokens.length -1;
      int tagDescId = getTagId(currentTag, documentTags);
      documentTags.tags.add(new int[]{tagDescId, tokIdx, tokIdx});
      //update the current tag
      currentTag = tagsiter.hasNext() ? tagsiter.next() : null;
      tagStart = currentTag == null ? -1 : currentTag.getStartNode().getOffset();
      tagEnd = currentTag == null ? -1 : currentTag.getEndNode().getOffset();
    }
    addMetadataField(documentData, TAGS_KEY, documentTags);
  }

  
  /**
   * Adds a new field to the metadata map saved by this class. 
   * @param key
   * @param value
   */
  protected void addMetadataField(DocumentData documentData, String key, 
          Serializable value){
    @SuppressWarnings("unchecked")
    HashMap<String, Serializable> myMetadata = 
      (HashMap<String, Serializable>)documentData.getMetadataField(
              getClass().getName());
    if(myMetadata == null){
      //this is the first time - let's add the map.
      myMetadata = new HashMap<String, Serializable>();
      documentData.putMetadataField(getClass().getName(), myMetadata);
    }
    myMetadata.put(key, value);
  }

  
  /**
   * Gets a metadata field value from the metadata map saved by this class. 
   * @param key
   * @param value
   */
  protected Serializable getMetadataField(DocumentData documentData, String key){
    @SuppressWarnings("unchecked")
    HashMap<String, Serializable> myMetadata = 
      (HashMap<String, Serializable>)documentData.getMetadataField(
              getClass().getName());
    return myMetadata == null ? null : myMetadata.get(key);
  }
  
  /**
   * Calculates the closing tag for a  given opening tag.
   * @param openingTag
   * @return
   */
  protected static String getClosingTag(String openingTag){
    StringBuilder closeTag = new StringBuilder("</");
    //are we inside the name?
    boolean inName = false;
    for(int charIdx = 0; charIdx < openingTag.length(); charIdx++){
      char currentChar = openingTag.charAt(charIdx);
      if(inName){
        //we're consuming non-space or > characters
        if(currentChar == ' ' || currentChar == '>'){
          //we're done!
          break;
        }else{
          closeTag.append(currentChar);
        }
      }else{
        //we're looking for the opening <
        if(currentChar == '<') inName = true;
      }
    }
    closeTag.append('>');
    return closeTag.toString();
  }
  
  /**
   * Gets the ID in the current list of tag descriptors for a given annotation
   * @param ann
   * @return
   */
  protected int getTagId(Annotation ann, DocumentTags documentMetadata){
    StringBuilder tagDesc = new StringBuilder("<");
    tagDesc.append(tagNameForAnnotation(ann));
    List<String> featNames = new ArrayList<String>();
    for(Map.Entry<Object, Object> entry : ann.getFeatures().entrySet()){
      if((entry.getKey() instanceof String) && (entry.getValue() != null)){
        featNames.add((String)entry.getKey());
      }
    }
    Collections.sort(featNames);
    for(String featName : featNames){
      String featValue = ann.getFeatures().get(featName).toString().replace("\"", """);
      tagDesc.append(' ');
      tagDesc.append(featName);
      tagDesc.append("=\"");
      tagDesc.append(featValue);
      tagDesc.append('"');
    }
    tagDesc.append(">");
    return documentMetadata.getTagDescriptorIndex(tagDesc.toString());
  }

  /**
   * Returns the tag name that should be used to represent the given
   * annotation.  This implementation simply returns the (trimmed)
   * annotation type, but subclasses may implement a more sophisticated
   * mapping.
   * 
   * @param ann an annotation
   * @return the tag name that should be used to represent the annotation
   */
  protected String tagNameForAnnotation(Annotation ann) {
    return ann.getType().trim();
  }
  
  
  /* (non-Javadoc)
   * @see gate.mimir.index.DocumentMetadataHelper#documentStart(gate.Document)
   */
  public void documentStart(GATEDocument document) {
    //we do nothing here
  }
  
  
  public static final String TAGS_KEY = "tags";
  
  /**
   * The names of the original tags that should be preserved as 
   * document metadata in the zip collection, and used for rendering documents.
   */
  public static final String[] DEFAULT_TAG_TYPES = new String[]{
    "b", "div", "i", "li", "ol", "p", "span", "sup", "sub", "table", "th", "td", "tr", "u", "ul"};

  /**
   * The tag used to mark-up query hits (opening tag).
   */
  public static final String HIT_OPENING_TAG = "<span class=\"mimir-hit\">";

  
  /**
   * The tag used to mark-up query hits (closing tag).
   */
  public static final String HIT_CLOSING_TAG = "</span>";
  
  /**
   * The types of annotations to be saved as markup metadata.
   */
  private Set<String> markupAnnTypes;
  
  /**
   * An object storing a list of tags (obtained from the original markup) that 
   * should be saved as document metadata. They are stored as triples of int 
   * values:
   * <ol>
   *   <li>the index in the {@link #tagDescriptors} array for the tag</li>
   *   <li>the start offset for the tag (in terms of token position);</li>
   *   <li>the end offset for the tag (in terms of token position); That is the
   *   position of the last token that is part of this tag.  Zero-length tags
   *   are represented by setting this position to -1.</li>
   * </ol>
   * 
   */
  protected static class DocumentTags implements Serializable{
    
    /**
     * Serialisation UID 
     */
    private static final long serialVersionUID = 5449290166356815305L;
    
    
    public DocumentTags(){
      tagDescriptorsSet = new HashSet<String>();
      tagDescriptors = new ArrayList<String>();
      tags = new ArrayList<int[]>();
    }
    
    /**
     * Gets the index in the {@link #tagDescriptors} list for a given tag 
     * descriptor. If no such descriptor is known, then a new one is added to 
     * the {@link #tagDescriptors} list and its index is returned. 
     * @param tagDescriptor
     * @return
     */
    public int getTagDescriptorIndex(String tagDescriptor){
      if(tagDescriptorsSet.add(tagDescriptor)){
        tagDescriptors.add(tagDescriptor);
        return tagDescriptors.size() -1;
      }else{
        return tagDescriptors.indexOf(tagDescriptor);
      }
    }
    
    
    @Override
    public String toString() {
      StringBuffer str = new StringBuffer();
      boolean first = true;
      for(int[] aTag : tags) {
        if(first) first = false;
        else str.append(' ');
        str.append(tagDescriptors.get(aTag[0])).append('(').append(aTag[1])
            .append(':').append(aTag[2]).append(')');
      }
      return str.toString();
    }
    
    /**
     * A set used internally to ensure uniqueness of the tag descriptors. 
     */
    private transient Set<String> tagDescriptorsSet;
    
    /**
     * A list of strings storing tag descriptors that are used to reduce the 
     * size of data stored as metadata.
     */
    private List<String> tagDescriptors;
    
    /**
     * A list of tags that are to be stored as document metadata. Each element 
     * is an array if 3 ints:
     * <ol>
     *   <li>the index in the {@link #tagDescriptors} array for the tag</li>
     *   <li>the start offset for the tag (in terms of token position);</li>
     *   <li>the end offset for the tag (in terms of token position);</li>
     * </ol>
     * 
     * This list is ordered based on the start offset, end offset and ID of the 
     * GATE annotations from which it was constructed, so that the tags should
     * appear in the correct document order. 
     */
    private List<int[]> tags;
    
    
  }
  
  /**
   * Compares annotation by start offset, end offset, and ID. Can be used to 
   * sort a list of markup tags in [a good approximation of] document order with
   * regard to their starting position.
   */
  protected static class StartComparator implements Comparator<Annotation>{

    /* (non-Javadoc)
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    public int compare(Annotation ann1, Annotation ann2) {
      long start1 = ann1.getStartNode().getOffset();
      long start2 = ann2.getStartNode().getOffset();
      if(start1 < start2){
        return -1;
      } else if(start1 > start2){
        return 1;
      }else{
        //same start offset
        long end1 = ann1.getEndNode().getOffset();
        long end2 = ann2.getEndNode().getOffset();
        if(end1 < end2){
          //first annotation ends sooner, so should start later
          return 1;
        }else if(end1 > end2){
          return -1;
        }else{
          //same end offset too -> used ann ID
         return ann1.getId() - ann2.getId(); 
        }
      }
    }
  }
}