FRBRGroupData.java example

Explorer

xtf-dsc-master
- WEB-INF
  - contrib
    - xtf-lucene
      - src
        java
        org
        apache
        lucene
        bigram
        BigramQueryRewriter.java
        BigramSpanRangeQuery.java
        BigramSpanWildcardQuery.java
        BigramStopFilter.java
        chunk
        Chunk.java
        ChunkMarkPos.java
        ChunkSource.java
        ChunkedWordIter.java
        DocNumMap.java
        SpanChunkedNotQuery.java
        SpanDechunkingQuery.java
        SparseStringComparator.java
        limit
        ExcessiveWorkException.java
        LimIndexReader.java
        LimTermDocs.java
        LimTermPositions.java
        TermLimitException.java
        mark
        BasicMarkPos.java
        BasicWordIter.java
        ContextMarker.java
        MarkCollector.java
        MarkPos.java
        WordIter.java
        search
        FieldSpanSource.java
        FlippableStringComparator.java
        QueryRewriter.java
        QueryTraverser.java
        RecordingSearcher.java
        SpanHitCollector.java
        spans
        EmptySpans.java
        FieldSpans.java
        NearSpans.java
        OrNearSpans.java
        Span.java
        SpanFirstQuery.java
        SpanNearQuery.java
        SpanNotNearQuery.java
        SpanNotQuery.java
        SpanOrNearQuery.java
        SpanOrQuery.java
        SpanPosComparator.java
        SpanQuery.java
        SpanRangeQuery.java
        SpanRecordingScorer.java
        SpanScorer.java
        SpanTermQuery.java
        SpanWeight.java
        SpanWildcardQuery.java
        Spans.java
        spell
        SpellKeywordTest.java
        spelt
        DoubleMetaphone.java
        FreqData.java
        LuceneIndexToDict.java
        MinimalAnalyzer.java
        QuerySpeller.java
        SimpleQueryRewriter.java
        SpellReader.java
        SpellTestCmdLine.java
        SpellWriter.java
        SpellWritingAnalyzer.java
        SpellWritingFilter.java
        TRStringDistance2.java
        WordEquiv.java
        util
        CountedInputStream.java
        CountedOutputStream.java
        FileSorter.java
        Hash64.java
        IntList.java
        LongList.java
        LongSet.java
        Prime.java
        PriorityQueue.java
        ProgressTracker.java
        RandomAccessInputStream.java
        StringUtil.java
      - test
        java
        org
        apache
        lucene
        spelt
        DoubleMetaphoneTest.java
        FreqDataTest.java
        LuceneIndexToDictTest.java
        QuerySpellerTest.java
        SimpleQueryRewriterTest.java
        SpellReadWriteTest.java
        SpellWritingAnalyzerTest.java
        TRStringDistance2Test.java
        util
        CountedInputStreamTest.java
        CountedOutputStreamTest.java
        FileSorterTest.java
        Hash64Test.java
        IntListTest.java
        LongListTest.java
        LongSetTest.java
        PrimeTest.java
        ProgressTrackerTest.java
        RandomAccessInputStreamTest.java
        StringUtilTest.java
  - src
    - net
      - sf
        saxon
        tinytree
        HackedTinyBuilder.java
        trans
        KeyManager.java
    - org
      - cdlib
        xtf
        cache
        Cache.java
        CacheDependency.java
        Dependency.java
        FileDependency.java
        GeneratingCache.java
        SimpleCache.java
        StringCache.java
        crossQuery
        CrossQuery.java
        CrossQueryConfig.java
        QueryRoute.java
        QueryRouteException.java
        TimeProfilingListener.java
        raw
        RawQuery.java
        test
        TestableCrossQuery.java
        dynaXML
        Authenticator.java
        DefaultDocLocator.java
        DocLocator.java
        DocRequest.java
        DynaXML.java
        DynaXMLConfig.java
        DynaXMLException.java
        InvalidDocumentException.java
        IpList.java
        NoPermissionException.java
        UnsupportedQueryException.java
        test
        TestableDynaXML.java
        lazyTree
        AncestorEnumeration.java
        AttributeEnumeration.java
        AttributeImpl.java
        ChildEnumeration.java
        DescendantEnumeration.java
        ElementImpl.java
        FastNodeTestPattern.java
        Flag.java
        FollowingEnumeration.java
        FollowingSiblingEnumeration.java
        LazyDocument.java
        LazyHashMap.java
        LazyKeyManager.java
        LazyProfilingListener.java
        LazyTreeBuilder.java
        NodeImpl.java
        ParentNodeImpl.java
        PersistentTree.java
        PrecedingEnumeration.java
        PrecedingOrAncestorEnumeration.java
        PrecedingSiblingEnumeration.java
        ProxyAttributeEnumeration.java
        ProxyAttributeImpl.java
        ProxyElement.java
        SearchElement.java
        SearchElementImpl.java
        SearchNode.java
        SearchTextImpl.java
        SearchTree.java
        TextImpl.java
        TreeEnumeration.java
        saxonExt
        ElementWithContent.java
        Exec.java
        Image.java
        InstructionWithContent.java
        Mail.java
        Pipe.java
        Redirect.java
        SQL.java
        exec
        ArgElement.java
        InputElement.java
        PipeImageElement.java
        PipeImageInstruction.java
        RunElement.java
        RunInstruction.java
        image
        ImageCache.java
        OutputElement.java
        mail
        SendElement.java
        pipe
        PipeBufferPool.java
        PipeFileElement.java
        PipeFopElement.java
        PipeRequestElement.java
        redirect
        HttpErrorElement.java
        RedirectElement.java
        sql
        SQLClose.java
        SQLColumn.java
        SQLConnect.java
        SQLDelete.java
        SQLInsert.java
        SQLProperty.java
        SQLQuery.java
        SQLUpdate.java
        servletBase
        CQLParseException.java
        DTDSuppressingXMLReader.java
        LatencyCutoffStream.java
        RedirectException.java
        SessionURLRewriter.java
        StylesheetCache.java
        TextConfig.java
        TextServlet.java
        test
        FakeOutputStream.java
        FakeServletConfig.java
        FakeServletContext.java
        FakeServletRequest.java
        FakeServletResponse.java
        NullOutputStream.java
        RegressTest.java
        textEngine
        AccentFoldingRewriter.java
        BoostSet.java
        BoostSetParams.java
        BoundedMarkPos.java
        BoundedWordIter.java
        ConfigCache.java
        Constants.java
        DefaultQueryProcessor.java
        DocHit.java
        DocHitImpl.java
        FlippingDirectory.java
        HitLoadException.java
        HitQueue.java
        IndexUtil.java
        IndexValidator.java
        IndexWarmer.java
        MoreLikeThisQuery.java
        NativeFSDirectory.java
        NumericFieldData.java
        NumericRangeQuery.java
        PluralFoldingRewriter.java
        QueryContext.java
        QueryGenException.java
        QueryProcessor.java
        QueryRequest.java
        QueryRequestParser.java
        QueryResult.java
        RefieldingQueryRewriter.java
        SlopFixupRewriter.java
        Snippet.java
        SnippetMaker.java
        SpanExactQuery.java
        SpanSectionTypeQuery.java
        SpellSuggRewriter.java
        SpellcheckParams.java
        SpellingSuggestion.java
        StdTermFilter.java
        StdTermRewriter.java
        TotalHitsComparator.java
        UnspanningQueryRewriter.java
        XtfBigramQueryRewriter.java
        XtfChunk.java
        XtfChunkMarkPos.java
        XtfChunkSource.java
        XtfChunkedWordIter.java
        XtfDocNumMap.java
        XtfLimIndexReader.java
        XtfQueryRewriter.java
        XtfQueryTraverser.java
        XtfSearcher.java
        XtfSpanRangeQuery.java
        XtfSpanWildcardQuery.java
        XtfWordEquiv.java
        facet
        ChildSelector.java
        DescendantSelector.java
        DocsSelector.java
        DynamicGroupData.java
        EmptySelector.java
        FRBRData.java
        FRBRGroupData.java
        FacetSpec.java
        GroupCounts.java
        GroupData.java
        GroupSelector.java
        MarkSelector.java
        NameSelector.java
        PageSelector.java
        ParseException.java
        RangeSelector.java
        ResultFacet.java
        ResultGroup.java
        RootSelector.java
        SelectedSelector.java
        SelectorParser.java
        SelectorParserConstants.java
        SelectorParserTokenManager.java
        SiblingSelector.java
        SimpleCharStream.java
        SingletonSelector.java
        StaticGroupData.java
        Token.java
        TokenMgrError.java
        TopChoiceSelector.java
        UnionSelector.java
        freeform
        CharStream.java
        FreeformQueryParser.java
        FreeformQueryParserConstants.java
        FreeformQueryParserTokenManager.java
        ParseException.java
        SimpleCharStream.java
        Token.java
        TokenMgrError.java
        textIndexer
        AccentFoldingFilter.java
        CrimsonBugWorkaround.java
        DocSelCache.java
        FacetTokenizer.java
        HTMLIndexSource.java
        HTMLToString.java
        IdxTreeCleaner.java
        IdxTreeCuller.java
        IdxTreeDictMaker.java
        IdxTreeOptimizer.java
        IndexDump.java
        IndexInfo.java
        IndexMerge.java
        IndexRecord.java
        IndexSource.java
        IndexStats.java
        IndexSync.java
        IndexerConfig.java
        MARCIndexSource.java
        MSWordIndexSource.java
        PDFIndexSource.java
        PDFToString.java
        PluralFoldingFilter.java
        SectionInfo.java
        SectionInfoStack.java
        SpellWritingFilter.java
        SrcTreeProcessor.java
        StartEndFilter.java
        StructuredFileProxy.java
        TagFilter.java
        TextIndexSource.java
        TextIndexer.java
        TextIndexerException.java
        XMLConfigParser.java
        XMLIndexSource.java
        XMLTextProcessor.java
        XTFTextAnalyzer.java
        XtfSpecialTokensFilter.java
        tokenizer
        CharStream.java
        FastCharStream.java
        ParseException.java
        Token.java
        TokenMgrError.java
        Tokenizer.java
        XTFTokenizer.java
        XTFTokenizerConstants.java
        XTFTokenizerTokenManager.java
        util
        ArrayUtil.java
        Attrib.java
        AttribList.java
        Base64.java
        CharMap.java
        CheckingTokenStream.java
        CircularQueue.java
        ConsecutiveMap.java
        DirSync.java
        DiskHashReader.java
        DiskHashWriter.java
        DocTypeDeclRemover.java
        EasyNode.java
        EmbeddedList.java
        FastIntCache.java
        FastStringCache.java
        FastStringReader.java
        FastTokenizer.java
        FileWalker.java
        FloatList.java
        GeneralException.java
        IntHash.java
        IntMultiMap.java
        LimitedOutputStream.java
        LineReader.java
        Linkable.java
        LinkableImpl.java
        Normalizer.java
        PackedByteBuf.java
        Path.java
        ProcessRunner.java
        StringHash.java
        StructuredFile.java
        StructuredStore.java
        SubDirFilter.java
        SubFileReader.java
        SubFileWriter.java
        SubStoreReader.java
        SubStoreWriter.java
        TagArray.java
        TagChars.java
        Tester.java
        ThreadWatcher.java
        Trace.java
        TraceWriter.java
        WordMap.java
        XMLFormatter.java
        XMLWriter.java
        XTFSaxonErrorListener.java
        xslt
        CharUtils.java
        FileUtils.java
        FreeformQuery.java
        Session.java
        XMLStubReader.java
        zing
        SRU.java
        SRUConfig.java

package org.cdlib.xtf.textEngine.facet;


/**
 * Copyright (c) 2006, Regents of the University of California
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 * - Neither the name of the University of California nor the names of its
 *   contributors may be used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Acknowledgements:
 *
 * A significant amount of new and/or modified code in this module
 * was made possible by a grant from the Andrew W. Mellon Foundation,
 * as part of the Melvyl Recommender Project.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.IntList;
import org.apache.lucene.util.Prime;
import org.cdlib.xtf.util.FloatList;
import org.cdlib.xtf.util.TagChars;
import org.cdlib.xtf.util.Trace;

/**
 * Implements a dynamic mapping from document to a FRBR-style title/author key.
 *
 * @author Martin Haye
 */
public class FRBRGroupData extends DynamicGroupData 
{
  /** Original parameter string */
  @SuppressWarnings("unused")
  private String params;

  /** Tag/doc data for the specified fields */
  private FRBRData data;

  /** IDs of matching documents */
  private IntList docs = new IntList();

  /** Highest doc ID encountered */
  private int maxDoc = 0;

  /** Score of each matching document */
  private FloatList docScores = new FloatList();

  /** Mapping of documents to groups */
  private IntList docGroups;

  /** First document in each group (for sorting purposes) */
  private IntList groupDocs;

  /** Number of documents in each group */
  private IntList groupDocCounts;

  /** Score of each group */
  private FloatList groupScores;

  /** Number of groups created so far */
  private int nGroups = 1; // group 0 is always the root

  /** Primary field to sort by */
  private int primarySort = FRBRData.TYPE_TITLE;

  /** Whether primary sort is in reverse order */
  private boolean reversePrimarySort = false;

  /**
   * Read in the FRBR data for a delimited list of fields.
   *
   * The parameter string is split on spaces, tabs, commas, semicolons and
   * vertical bars. Tokens starting with '[' are control markers that select
   * the primary sort; every other token is taken as a field name.
   *
   * @param indexReader   Reader passed through to the FRBR data cache
   * @param tokFields     Names of tokenized fields; any field named in
   *                      <code>params</code> that appears here is rejected
   * @param params        Delimited list of field names and control markers
   * @throws IOException      declared by this method; presumably raised while
   *                          the FRBR data is loaded
   * @throws RuntimeException if a control marker is unknown, or if a
   *                          requested field is tokenized
   */
  public void init(IndexReader indexReader, Set tokFields, String params)
    throws IOException 
  {
    // Record the input
    this.params = params;

    // Break the string of parameters into a list of fields.
    StringTokenizer t = new StringTokenizer(params, " \t,;|");
    ArrayList<String> fields = new ArrayList<String>(t.countTokens());
    while (t.hasMoreTokens()) 
    {
      String tok = t.nextToken();
      if (tok.startsWith("[")) 
      {
        // Every sort marker sets both the field and the direction, so that
        // a reverse flag from an earlier "[sort=-date]" cannot leak into a
        // sort selected by a later marker.
        //
        if (tok.equals("[sort=title]")) {
          primarySort = FRBRData.TYPE_TITLE;
          reversePrimarySort = false;
        }
        else if (tok.equals("[sort=author]")) {
          primarySort = FRBRData.TYPE_AUTHOR;
          reversePrimarySort = false;
        }
        else if (tok.equals("[sort=date]")) {
          primarySort = FRBRData.TYPE_DATE;
          reversePrimarySort = false;
        }
        else if (tok.equals("[sort=-date]")) {
          primarySort = FRBRData.TYPE_DATE;
          reversePrimarySort = true;
        }
        else if (tok.equals("[sort=id]")) {
          primarySort = FRBRData.TYPE_ID;
          reversePrimarySort = false;
        }
        else
          throw new RuntimeException("Unknown control marker: " + tok);
      }
      else 
      {
        // Our algorithms fail badly on tokenized fields, so flag that.
        if (tokFields.contains(tok))
          throw new RuntimeException("XTF's FRBR algorithms cannot work with tokenized fields, e.g. '" + tok + "'");
        
        // Field is okay, add it to our list.
        fields.add(tok);
      }
    }

    // And fetch the doc/tag data for those fields.
    data = FRBRData.getCachedTags(indexReader, fields.toArray(new String[fields.size()]));
  }

  /**
   * Record one document that matched the query, along with its score.
   * Documents are expected in strictly increasing ID order (checked by
   * assertion only).
   *
   * @param doc     ID of the matching document
   * @param score   Score of that document
   */
  public void collect(int doc, float score) {
    assert docs.isEmpty() || docs.getLast() < doc : "docs out of order";

    docs.add(doc);
    docScores.add(score);

    // Track the highest ID seen so far.
    if (doc > maxDoc)
      maxDoc = doc;
  } // collect()

  /**
   * Form the final FRBR groups for the document set.
   *
   * First assigns every collected document to a group (via
   * {@link #findGroup(int)}), then builds the per-group tables: the first
   * document of each group, the number of documents in each group, and the
   * highest document score in each group. Group 0 (the root) receives the
   * total document count and the highest score overall.
   */
  public void finish() 
  {
    Trace.debug("Building FRBR groups for " + docs.size() + " docs...");
    Trace.tab();

    // Save space in the document and score lists.
    docs.compact();
    docScores.compact();

    // Figure out a group for each document. A value of -1 means "no group
    // assigned yet".
    docGroups = new IntList(maxDoc + 1);
    docGroups.fill(-1);
    for (int i = 0; i < docs.size(); i++) 
    {
      int doc = docs.get(i);

      // Skip docs that already have a group assigned (findGroup may have
      // pulled them into the group of an earlier document).
      if (docGroups.get(doc) >= 0)
        continue;

      // Go looking...
      findGroup(doc);
    }

    Trace.debug(nGroups + " groups. Inverting map...");

    // Form the count and score lists.
    groupDocs = new IntList(nGroups);
    groupDocCounts = new IntList(nGroups);
    groupScores = new FloatList(nGroups);
    for (int i = 0; i < docs.size(); i++) 
    {
      int doc = docs.get(i);
      float score = docScores.get(i);
      int group = docGroups.get(doc);
      assert group >= 0 : "group should have been assigned";

      // The first document seen for a group becomes its representative.
      // Decide "first" from the running count rather than from the stored
      // doc ID being zero: document 0 is a valid ID, and testing for zero
      // would let a later member overwrite it.
      //
      int countSoFar = groupDocCounts.get(group);
      if (countSoFar == 0)
        groupDocs.set(group, doc);

      groupDocCounts.set(group, countSoFar + 1);

      // Keep the best score for the group and for the root.
      groupScores.set(group, Math.max(groupScores.get(group), score));
      groupScores.set(0, Math.max(groupScores.get(0), score));
    }

    // The root group counts every document.
    groupDocCounts.set(0, docs.size());

    Trace.debug("Done.");
    Trace.untab();
  } // finish()

  /**
   * Start a new group for the given document, then try to pull other
   * documents into it by examining titles near each of the document's own
   * titles. If nothing else matches, the group stays a singleton.
   *
   * @param mainDoc     Document to put into a group
   */
  private void findGroup(int mainDoc) 
  {
    // Open a fresh group holding just this document for now.
    docGroups.set(mainDoc, nGroups++);

    // Visit every tag of the document, acting only on its titles.
    int pos = data.docTags.firstPos(mainDoc);
    while (pos >= 0) 
    {
      final int titleTag = data.docTags.getValue(pos);
      if (data.tags.getType(titleTag) == FRBRData.TYPE_TITLE) 
      {
        // Walk forward from the title itself (it is included because other
        // documents may carry exactly the same title) until a title fails
        // to match or the tags run out.
        //
        for (int t = titleTag;
             t >= 0 && matchOnTitle(mainDoc, titleTag, t);
             t = data.tags.next(t)) 
        {
          // all the work happens in matchOnTitle()
        }

        // Then walk backward from the preceding title in the same way.
        for (int t = data.tags.prev(titleTag);
             t >= 0 && matchOnTitle(mainDoc, titleTag, t);
             t = data.tags.prev(t)) 
        {
          // all the work happens in matchOnTitle()
        }
      }

      pos = data.docTags.nextPos(pos);
    } // while tag
  } // findGroup()

  /**
   * Checks whether a candidate title is close enough to the main title to be
   * worth examining, and if it is, tries to add each document carrying the
   * candidate title to the main document's group.
   *
   * @param mainDoc       main document being matched
   * @param mainTitle     main doc's title tag
   * @param compTitle     title tag to compare
   * @return              true if title iteration should continue, i.e. the
   *                      titles matched (whether or not any document did)
   */
  private boolean matchOnTitle(int mainDoc, int mainTitle, int compTitle) 
  {
    // Identical tags match trivially; otherwise fall back to the
    // before-the-colon comparison. No match at all ends the iteration.
    //
    final boolean sameTitle = (mainTitle == compTitle);
    if (!sameTitle && !matchPartialTitle(mainTitle, compTitle))
      return false;

    // Consider every document that carries the candidate title.
    for (int pos = data.tagDocs.firstPos(compTitle); pos >= 0;
         pos = data.tagDocs.nextPos(pos)) 
    {
      final int candidate = data.tagDocs.getValue(pos);

      // A candidate is only worth scoring if it is a different document, is
      // part of the query's result set, and has no group yet (a candidate
      // that already belongs to some group is left where it is).
      //
      final boolean eligible = candidate != mainDoc &&
                               docs.binarySearch(candidate) >= 0 &&
                               docGroups.get(candidate) < 0;

      // Close enough on all fields? Then it joins the main doc's group.
      if (eligible && multiFieldMatch(mainDoc, candidate))
        docGroups.set(candidate, docGroups.get(mainDoc));
    }
    
    // The title itself matched, so the caller should keep scanning.
    return true;
  } // matchOnTitle()

  // Instance variables to avoid re-allocation for each iteration.
  private IntList matchTags1 = new IntList();
  private IntList matchTags2 = new IntList();

  /**
   * Decide whether two documents should share a FRBR group. The tags of
   * both documents are gathered one type at a time, each type is scored by
   * its own scoring method, and the scores are summed; a total of 150 or
   * more counts as a match.
   *
   * Relies on each document's tags being ordered by type, which is
   * presumably guaranteed by FRBRData.
   *
   * @param doc1     First document
   * @param doc2     Second document
   * @return            true if they're equivalent
   */
  private boolean multiFieldMatch(int doc1, int doc2) 
  {
    int titlePts = 0;
    int authorPts = 0;
    int datePts = 0;
    int idPts = 0;

    // Cursor over the first document's tags. Type 99 stands for "no more
    // tags", and sorts after every real type.
    int pos1 = data.docTags.firstPos(doc1);
    int curTag1 = -1;
    int curType1 = 99;
    if (pos1 >= 0) {
      curTag1 = data.docTags.getValue(pos1);
      curType1 = data.tags.getType(curTag1);
    }

    // Same cursor for the second document.
    int pos2 = data.docTags.firstPos(doc2);
    int curTag2 = -1;
    int curType2 = 99;
    if (pos2 >= 0) {
      curTag2 = data.docTags.getValue(pos2);
      curType2 = data.tags.getType(curTag2);
    }

    // Handle one type per pass until both cursors are exhausted.
    while (pos1 >= 0 || pos2 >= 0) 
    {
      // The lower of the two pending types is next in line.
      final int type = (curType1 <= curType2) ? curType1 : curType2;
      assert type != 99;

      // Gather the first doc's tags of this type.
      matchTags1.clear();
      while (curType1 == type) {
        matchTags1.add(curTag1);
        pos1 = data.docTags.nextPos(pos1);
        if (pos1 >= 0) {
          curTag1 = data.docTags.getValue(pos1);
          curType1 = data.tags.getType(curTag1);
        }
        else {
          curTag1 = -1;
          curType1 = 99;
        }
      }

      // Gather the second doc's tags of this type.
      matchTags2.clear();
      while (curType2 == type) {
        matchTags2.add(curTag2);
        pos2 = data.docTags.nextPos(pos2);
        if (pos2 >= 0) {
          curTag2 = data.docTags.getValue(pos2);
          curType2 = data.tags.getType(curTag2);
        }
        else {
          curTag2 = -1;
          curType2 = 99;
        }
      }

      // Score the two tag lists with the scorer for this type. Types other
      // than the four below contribute nothing.
      if (type == FRBRData.TYPE_TITLE) {
        debugFieldMatch("title", doc1, doc2);
        titlePts = scoreTitleMatch(matchTags1, matchTags2);
      }
      else if (type == FRBRData.TYPE_AUTHOR) {
        debugFieldMatch("author", doc1, doc2);
        authorPts = scoreAuthorMatch(matchTags1, matchTags2);
      }
      else if (type == FRBRData.TYPE_DATE) {
        debugFieldMatch("date", doc1, doc2);
        datePts = scoreDateMatch(matchTags1, matchTags2);
      }
      else if (type == FRBRData.TYPE_ID) {
        debugFieldMatch("id", doc1, doc2);
        idPts = scoreIdMatch(matchTags1, matchTags2);
      }
    } // while
    assert pos1 < 0 && pos2 < 0;

    final int total = titlePts + authorPts + datePts + idPts;

    // Diagnostic dump of the two documents and their scores. Switched off;
    // to trace successful matches, replace the condition with
    // (total >= 150).
    //
    if (false) {
      outputDisplayKey("Match: ", doc1);
      outputDisplayKey("   vs: ", doc2);
      Trace.debug(
        "     = " + titlePts + "t + " + authorPts + "a + " + datePts +
        "d + " + idPts + "i = " + total);
    }

    // Is the total score high enough?
    return total >= 150;
  }

  /**
   * Diagnostic helper that traces the tags currently held in matchTags1 and
   * matchTags2 for the two documents being compared. The leading 'true' in
   * the guard below short-circuits the check, so as written the method
   * always returns without output; drop the 'true ||' to enable it at debug
   * output level.
   *
   * @param field    Name of the field being compared (for display)
   * @param doc1     First document
   * @param doc2     Second document
   */
  private void debugFieldMatch(String field, int doc1, int doc2) 
  {
    if (true || Trace.getOutputLevel() != Trace.debug)
      return;

    Trace.debug("Match " + field + ":");
    Trace.tab();

    // Tags collected from the first document.
    Trace.debug("Doc " + doc1);
    Trace.tab();
    for (int k = 0; k < matchTags1.size(); k++) {
      final int tag = matchTags1.get(k);
      Trace.debug(data.tags.getString(tag) + " {tag=" + tag + "}");
    }
    Trace.untab();

    // Tags collected from the second document.
    Trace.debug("Doc " + doc2);
    Trace.tab();
    for (int k = 0; k < matchTags2.size(); k++) {
      final int tag = matchTags2.get(k);
      Trace.debug(data.tags.getString(tag) + " {tag=" + tag + "}");
    }
    Trace.untab();

    Trace.untab();
  }

  /**
   * Diagnostic helper that traces a document's tags as fixed-width columns,
   * one column per tag type. Each output row shows the n-th tag of every
   * type; rows continue until no type has another tag. The first row is
   * always printed, even for a document with no tags at all.
   *
   * @param title    Label for the first row; later rows are indented by
   *                 the same width
   * @param doc      Document whose tags are displayed
   */
  private void outputDisplayKey(String title, int doc) 
  {
    // Column width for each tag type, indexed by type number.
    final int[] columnWidths = { 0, 50, 40, 4, 30 };
    final String padding = "                                                             ";

    int row = 0;
    int nFound;
    do 
    {
      StringBuilder line = new StringBuilder();
      nFound = 0;
      for (int type = FRBRData.FIRST_TYPE; type <= FRBRData.LAST_TYPE; type++) 
      {
        // Pick the row-th tag of this type, if the document has one.
        String cell = "";
        int seen = 0;
        int pos = data.docTags.firstPos(doc);
        while (pos >= 0) 
        {
          final int tag = data.docTags.getValue(pos);
          if (data.tags.getType(tag) == type) 
          {
            if (seen == row) {
              cell = data.tags.getString(tag) + " [" +
                     data.tags.getSubType(tag) + "]";
              nFound++;
            }
            seen++;
          }
          pos = data.docTags.nextPos(pos);
        }

        // Truncate or pad the cell to the column width.
        final int width = columnWidths[type];
        final int keep = Math.min(cell.length(), width);
        if (line.length() > 0)
          line.append(" | ");
        line.append(cell.substring(0, keep));
        line.append(padding.substring(0, width - keep));
      } // for

      if (nFound > 0 || row == 0) {
        Trace.debug(title + line);

        // Continuation rows get blanks in place of the label.
        title = padding.substring(0, title.length());
        ++row;
      }
    } while (nFound > 0);
  } // outputDisplayKey()

  private TagChars chars1 = new TagChars();
  private TagChars chars2 = new TagChars();

  /**
   * Score the potential match of two lists of title tags.
   *
   * The lists are walked in parallel. Two tags pair up when they have the
   * same sub-type and are either the same tag or match before a colon (see
   * matchPartialTitle); otherwise the lower-numbered tag is skipped.
   *
   * @param list1   Title tags of the first document
   * @param list2   Title tags of the second document
   * @return        0 if both lists are empty; 100 if every tag paired
   *                exactly; 80 if every tag paired but at least one pairing
   *                was partial, or if one list was fully paired and the
   *                other had leftovers; -100 otherwise
   */
  private int scoreTitleMatch(IntList list1, IntList list2) 
  {
    // If both lists are empty, it's no foul, no score.
    if (list1.isEmpty() && list2.isEmpty())
      return 0;

    final int n1 = list1.size();
    final int n2 = list2.size();
    int i1 = 0;
    int i2 = 0;
    int paired = 0;
    int unpaired1 = 0;
    int unpaired2 = 0;
    int bestScore = 100;

    while (i1 < n1 && i2 < n2) 
    {
      final int a = list1.get(i1);
      final int b = list2.get(i2);
      final boolean sameSubType =
        data.tags.getSubType(a) == data.tags.getSubType(b);

      // Exact match, or (failing that) a match before the colon. A partial
      // match caps the best achievable score.
      boolean matched = false;
      if (sameSubType) 
      {
        if (a == b)
          matched = true;
        else if (matchPartialTitle(a, b)) {
          matched = true;
          bestScore = 80;
        }
      }

      if (matched) {
        ++paired;
        ++i1;
        ++i2;
      }
      else if (a < b) {
        // No pairing: drop the lower tag and try again.
        ++unpaired1;
        ++i1;
      }
      else {
        ++unpaired2;
        ++i2;
      }
    }

    // Whatever is left over in either list never found a partner.
    unpaired1 += n1 - i1;
    unpaired2 += n2 - i2;

    // Everything paired up?
    if (unpaired1 == 0 && unpaired2 == 0) {
      assert paired > 0;
      return bestScore;
    }

    // One list entirely covered by the other?
    if (paired > 0 && (unpaired1 == 0 || unpaired2 == 0))
      return 80;

    // Both sides had tags that didn't pair (or nothing paired at all).
    return -100;
  } // scoreTitleMatch()

  /**
   * Check whether one title equals the other up to (at least) the other's
   * colon, e.g. a bare title versus the same title followed by a colon and
   * more text. Only colons at position 10 or later count, and the titles
   * must share a prefix of at least 10 characters. If the first title has
   * such a colon, only that direction is tested.
   *
   * Overwrites the scratch buffers chars1 and chars2.
   *
   * @param tag1   First title tag
   * @param tag2   Second title tag
   * @return       true if the titles match in this partial sense
   */
  private boolean matchPartialTitle(int tag1, int tag2) 
  {
    data.tags.getChars(tag1, chars1);
    data.tags.getChars(tag2, chars2);

    // prefixMatch() presumably yields the length of the common prefix.
    final int shared = chars1.prefixMatch(chars2);
    if (shared >= 10) 
    {
      // First title has the colon: the second must be entirely a prefix of
      // the first, reaching at least as far as the colon.
      final int colon1 = chars1.indexOf(':');
      if (colon1 >= 10)
        return shared == chars2.length() && shared >= colon1;

      // Otherwise try the same test with the roles reversed.
      final int colon2 = chars2.indexOf(':');
      if (colon2 >= 10)
        return shared == chars1.length() && shared >= colon2;
    }

    // Too little in common, or no usable colon in either title.
    return false;
  }

  /**
   * Score the potential match of two lists of author tags.
   *
   * The lists are walked in parallel. Two tags pair up when they have the
   * same sub-type and are either the same tag or match on keywords (see
   * matchPartialAuthor); otherwise the lower-numbered tag is skipped.
   *
   * @param list1   Author tags of the first document
   * @param list2   Author tags of the second document
   * @return        75 if both lists are empty; 100 if every tag paired
   *                exactly; 80 if every tag paired but at least one pairing
   *                was partial, or if one list was fully paired and the
   *                other had leftovers; -100 otherwise
   */
  private int scoreAuthorMatch(IntList list1, IntList list2) 
  {
    // If both lists are empty, consider that a bit of good.
    if (list1.isEmpty() && list2.isEmpty())
      return 75;

    final int count1 = list1.size();
    final int count2 = list2.size();
    int idx1 = 0;
    int idx2 = 0;
    int hits = 0;
    int misses1 = 0;
    int misses2 = 0;
    int ceiling = 100;

    while (idx1 < count1 && idx2 < count2) 
    {
      final int author1 = list1.get(idx1);
      final int author2 = list2.get(idx2);
      final boolean sameSubType =
        data.tags.getSubType(author1) == data.tags.getSubType(author2);

      // Exact match, or (failing that) the same keywords in any order. A
      // keyword-only match caps the best achievable score.
      boolean matched = false;
      if (sameSubType) 
      {
        if (author1 == author2)
          matched = true;
        else if (matchPartialAuthor(author1, author2)) {
          matched = true;
          ceiling = 80;
        }
      }

      if (matched) {
        ++hits;
        ++idx1;
        ++idx2;
      }
      else if (author1 < author2) {
        // No pairing: drop the lower tag and try again.
        ++misses1;
        ++idx1;
      }
      else {
        ++misses2;
        ++idx2;
      }
    }

    // Whatever is left over in either list never found a partner.
    misses1 += count1 - idx1;
    misses2 += count2 - idx2;

    // Everything paired up?
    if (misses1 == 0 && misses2 == 0) {
      assert hits > 0;
      return ceiling;
    }

    // One list entirely covered by the other?
    if (hits > 0 && (misses1 == 0 || misses2 == 0))
      return 80;

    // Both sides had tags that didn't pair (or nothing paired at all).
    return -100;
  } // scoreAuthorMatch()

  /** Stamp written into wordHash; bumped for every author comparison */
  private int wordHashKey = 0;

  /** Number of slots in the word hash table */
  private static final int WORD_HASH_SIZE = Prime.findAfter(1000000);

  /** Slot holds the stamp of the comparison that last hashed a word there */
  private int[] wordHash = new int[WORD_HASH_SIZE];

  /** Per-character class table: 'p' marks a word separator */
  private static final char[] charType = new char[0x10000];

  static 
  {
    // Whitespace first, then punctuation; all are treated alike.
    final String separators = " \t\n\r\f" + "'\".&@-/,:;()[]";
    for (int k = 0; k < separators.length(); k++)
      charType[separators.charAt(k)] = 'p';
  }

  /**
   * Compare two author names to see if the keywords from the shorter one
   * are all present in the longer one, in any order.
   *
   * Words are runs of characters between separators (see charType); only
   * words longer than three characters take part. Words are compared by
   * hash slot only, so distinct words that land in the same slot are
   * treated as equal.
   *
   * Overwrites the scratch buffers chars1 and chars2 (and may exchange
   * them).
   *
   * @param tag1   First author tag
   * @param tag2   Second author tag
   * @return       true if the shorter name has at least two qualifying
   *               words and every one of them was found in the longer name
   */
  private boolean matchPartialAuthor(int tag1, int tag2) 
  {
    data.tags.getChars(tag1, chars1);
    data.tags.getChars(tag2, chars2);

    // Arrange for chars1 to hold the longer of the two names.
    if (chars1.length() < chars2.length()) 
    {
      TagChars swap = chars1;
      chars1 = chars2;
      chars2 = swap;
    }

    // A fresh stamp distinguishes this comparison's hash entries from
    // those left behind by earlier ones, so the table never needs clearing.
    //
    final int stamp = ++wordHashKey;

    // Stamp the slot of every qualifying word in the longer name.
    final int longLen = chars1.length();
    int pos = 0;
    while (pos < longLen) 
    {
      int hash = 0;
      int wordLen = 0;
      while (pos < longLen) 
      {
        final char ch = chars1.charAt(pos++);
        if (charType[ch] == 'p')
          break; // separator consumed; word is complete
        hash = (hash * 31) + ch;
        ++wordLen;
      }

      if (hash != 0 && wordLen > 3)
        wordHash[(hash & 0x7FFFFFFF) % WORD_HASH_SIZE] = stamp;
    }

    // Now look up every qualifying word of the shorter name.
    final int shortLen = chars2.length();
    int nWords = 0;
    int nFound = 0;
    pos = 0;
    while (pos < shortLen) 
    {
      int hash = 0;
      int wordLen = 0;
      while (pos < shortLen) 
      {
        final char ch = chars2.charAt(pos++);
        if (charType[ch] == 'p')
          break; // separator consumed; word is complete
        hash = (hash * 31) + ch;
        ++wordLen;
      }

      if (hash != 0 && wordLen > 3) {
        ++nWords;
        if (wordHash[(hash & 0x7FFFFFFF) % WORD_HASH_SIZE] == stamp)
          ++nFound;
      }
    } // while

    // Good only if there were at least two words and none went missing.
    return nWords >= 2 && nFound == nWords;
  } // matchPartialAuthor()

  /**
   * Score how well the dates of two records agree.
   *
   * @param list1  date tags of the first record (at most one expected)
   * @param list2  date tags of the second record (at most one expected)
   * @return       50 when both records carry the very same date tag; 0 when
   *               either record has no date or no year can be parsed from it;
   *               -20 when the tags differ but the parsed years are equal;
   *               -40 when the years differ by at most two; -60 otherwise
   */
  @SuppressWarnings("unused")
  private int scoreDateMatch(IntList list1, IntList list2) 
  {
    // A missing date is neither a point for nor against.
    if (list1.isEmpty() || list2.isEmpty())
      return 0;

    // Dates currently come from sort-year, so each list has a single entry.
    assert list1.size() == 1;
    assert list2.size() == 1;

    final int tag1 = list1.get(0);
    final int tag2 = list2.get(0);

    // Identical tags are the best possible outcome.
    if (tag1 == tag2)
      return 50;

    // Different tags: fall back to comparing the years embedded in them.
    data.tags.getChars(tag1, chars1);
    data.tags.getChars(tag2, chars2);
    final int year1 = parseYear(chars1);
    final int year2 = parseYear(chars2);

    // Without a year on both sides there is nothing to compare; stay neutral.
    if (year1 < 0 || year2 < 0)
      return 0;

    // Same year under different tags is only slightly bad; a gap of up to two
    // years is a bit worse; anything larger is a clear mismatch.
    final int gap = Math.abs(year1 - year2);
    if (gap == 0)
      return -20;
    return (gap <= 2) ? -40 : -60;
  } // scoreDateMatch

  /**
   * Search characters for a year. Digits are accumulated run by run, and the
   * first time the value of the digits read so far in a run falls strictly
   * between 1800 and 2100, that value is returned. Digits following the year
   * in the same run are ignored, so "19991231" yields 1999.
   *
   * @param chars  characters to scan
   * @return       the year found (1801 through 2099), or -99 if there is none
   */
  private int parseYear(TagChars chars) 
  {
    final int minExclusive = 1800;
    final int maxExclusive = 2100;

    int num = 0;
    for (int i=0; i<chars.length(); i++) {
      char ch = chars.charAt(i);

      // Any non-digit ends the current run.
      if (ch < '0' || ch > '9') {
        num = 0;
        continue;
      }

      // Once a run has reached the upper bound, more digits can only make it
      // larger, so it can never become a year. Stop accumulating: otherwise a
      // long run (e.g. "4294969196") overflows int and can wrap around to a
      // bogus value inside the accepted range.
      if (num >= maxExclusive)
        continue;

      num = (num * 10) + (ch - '0');
      if (num > minExclusive && num < maxExclusive)
        return num;
    }
    return -99;
  }

  /**
   * Score the potential match of two lists of identifiers by walking both
   * lists in parallel.
   *
   * @param list1  identifier tags of the first record
   * @param list2  identifier tags of the second record
   * @return       0 if both lists are empty; 100 if every entry pairs up
   *               exactly; 80 if every entry pairs up but at least one only
   *               partially (see matchPartialId), or if one list is wholly
   *               matched while the other has extras; 0 in all other cases
   */
  private int scoreIdMatch(IntList list1, IntList list2) 
  {
    // Nothing on either side: no foul, no score.
    if (list1.isEmpty() && list2.isEmpty())
      return 0;

    final int size1 = list1.size();
    final int size2 = list2.size();

    int p1 = 0;
    int p2 = 0;
    int nMatches = 0;
    int skipped1 = 0;
    int skipped2 = 0;
    int maxScore = 100;

    while (p1 < size1 && p2 < size2) 
    {
      final int tag1 = list1.get(p1);
      final int tag2 = list2.get(p2);
      final int subType1 = data.tags.getSubType(tag1);
      final int subType2 = data.tags.getSubType(tag2);

      // Entries can only pair up within the same sub-type: either the tags
      // are identical, or the identifiers agree up to a parenthesis.
      final boolean sameSubType = (subType1 == subType2);
      final boolean exact = sameSubType && tag1 == tag2;
      final boolean partial = sameSubType && !exact && matchPartialId(tag1, tag2);

      if (exact || partial) {
        // A partial pairing caps the best achievable score.
        if (partial)
          maxScore = 80;
        ++nMatches;
        ++p1;
        ++p2;
      }
      else if (tag1 < tag2) {
        // No pairing: step past the smaller tag.
        ++skipped1;
        ++p1;
      }
      else {
        ++skipped2;
        ++p2;
      }
    }

    // Whatever remains on either list has no partner.
    skipped1 += (size1 - p1);
    skipped2 += (size2 - p2);

    // Every entry on both sides found a partner.
    if (skipped1 == 0 && skipped2 == 0) {
      assert nMatches > 0;
      return maxScore;
    }

    // One list is completely covered by the other.
    if (nMatches > 0 && (skipped1 == 0 || skipped2 == 0))
      return 80;

    // Both sides have unmatched entries. That is common with identifiers, so
    // it isn't counted as a negative.
    return 0;
  } // scoreIdMatch()

  /**
   * Check whether two identifiers are the same apart from a parenthesized
   * suffix on one of them.
   *
   * @param tag1  tag of the first identifier
   * @param tag2  tag of the second identifier
   * @return      true if one identifier has a '(' at position 6 or later and
   *              the other identifier is, in its entirety, a prefix of it
   *              reaching at least up to that parenthesis
   */
  private boolean matchPartialId(int tag1, int tag2) 
  {
    data.tags.getChars(tag1, chars1);
    data.tags.getChars(tag2, chars2);

    // Fewer than this many leading characters in common: not worth trying.
    final int minPrefix = 6;

    final int common = chars1.prefixMatch(chars2);
    if (common >= minPrefix) 
    {
      // Does the first identifier carry the parenthesis? If so, the second
      // must be fully consumed by the common prefix.
      int paren = chars1.indexOf('(');
      if (paren >= minPrefix)
        return common >= paren && common == chars2.length();

      // Otherwise try the same thing the other way around.
      paren = chars2.indexOf('(');
      if (paren >= minPrefix)
        return common >= paren && common == chars1.length();
    }

    return false;
  }

  /**
   * Get the field name (synthetic in our case).
   *
   * @return always the fixed string "dynamicFRBR"
   */
  public String field() {
    return "dynamicFRBR";
  }

  // inherit JavaDoc
  //
  // Group names are synthesized: the prefix "group-" followed by the numeric
  // group id. findGroup() parses names of this form back into ids.
  public String name(int groupId) {
    return "group-" + groupId;
  }

  // inherit JavaDoc
  //
  // Accepts names of the form produced by name(): "group-" followed by a
  // non-negative decimal id. Any other input (null, a different prefix, a
  // missing or non-numeric id, a negative id, or a number too large for an
  // int) yields -1 instead of an exception.
  public int findGroup(String name) {
    final String prefix = "group-";
    if (name == null || !name.startsWith(prefix))
      return -1;

    try {
      int groupId = Integer.parseInt(name.substring(prefix.length()));
      return (groupId >= 0) ? groupId : -1;
    }
    catch (NumberFormatException e) {
      // Not a valid number, so there is no such group.
      return -1;
    }
  }

  // inherit JavaDoc
  //
  // Only group 0 has children; its first child is group 1, provided any
  // group other than 0 exists.
  public int child(int groupId) {
    if (groupId != 0)
      return -1;
    if (nGroups <= 1)
      return -1;
    return 1;
  }

  // inherit JavaDoc
  //
  // Groups other than 0 are chained in id order; group 0 and the last group
  // have no next sibling.
  public int sibling(int groupId) {
    final boolean isGroupZero = (groupId == 0);
    final boolean isLastGroup = (groupId == nGroups - 1);
    if (isGroupZero || isLastGroup)
      return -1;
    return groupId + 1;
  }

  // inherit JavaDoc
  //
  // Group 0 has no parent; every other group hangs directly off group 0.
  public int parent(int groupId) {
    if (groupId == 0)
      return -1;
    return 0;
  }

  // inherit JavaDoc
  //
  // Group 0 owns every other group; no other group has children.
  public int nChildren(int groupId) {
    if (groupId != 0)
      return 0;
    return nGroups - 1;
  }

  // inherit JavaDoc
  //
  // The link id is whatever docGroups holds for the document. Since
  // linkGroup() maps a link id to itself, this is presumably the id of the
  // group the document was assigned to -- confirm where docGroups is filled.
  public int firstLink(int docId) {
    return docGroups.get(docId);
  }

  // inherit JavaDoc
  //
  // Always -1: no link is ever followed by another, so a document has at
  // most the single link reported by firstLink().
  public int nextLink(int linkId) {
    return -1;
  }

  // inherit JavaDoc
  //
  // Link ids and group ids are the same numbers here, so the link id is
  // returned unchanged.
  public int linkGroup(int linkId) {
    return linkId;
  }

  // inherit JavaDoc
  //
  // The count includes group 0; nChildren(0) reports one less than this.
  public int nGroups() {
    return nGroups;
  }

  // inherit JavaDoc
  //
  // This implementation unconditionally reports itself as dynamic.
  public boolean isDynamic() {
    return true;
  }

  // inherit JavaDoc
  //
  // Looks the value up in groupDocCounts, indexed by group id.
  public int nDocHits(int groupId) {
    return groupDocCounts.get(groupId);
  }

  // inherit JavaDoc
  //
  // Looks the value up in groupScores, indexed by group id.
  public float score(int groupId) {
    return groupScores.get(groupId);
  }

  // inherit JavaDoc
  //
  // Groups are ordered by comparing the document that groupDocs records for
  // each of them: first on the primary sort field (honoring its reverse
  // flag), then on each remaining field type in ascending type order.
  public final int compare(int group1, int group2) 
  {
    // A group always equals itself.
    if (group1 == group2)
      return 0;

    // Fetch the document that stands for each group.
    final int doc1 = groupDocs.get(group1);
    final int doc2 = groupDocs.get(group2);

    // The primary field decides, unless it's a tie...
    int result = compareField(primarySort, doc1, doc2, reversePrimarySort);

    // ...in which case the other fields break it, stopping at the first
    // one that differs.
    for (int type = FRBRData.FIRST_TYPE;
         result == 0 && type <= FRBRData.LAST_TYPE;
         ++type) 
    {
      if (type == primarySort)
        continue;
      result = compareField(type, doc1, doc2, false);
    }

    // Zero here means no field told the two groups apart.
    return result;
  }

  /**
   * Find the title of a document.
   *
   * @param doc  document whose tags should be searched
   * @return     the string of the first title tag attached to the document,
   *             or an empty string if it has none
   */
  @SuppressWarnings("unused")
  private String docTitle(int doc) {
    int pos = data.docTags.firstPos(doc);
    while (pos >= 0) {
      final int tag = data.docTags.getValue(pos);
      if (data.tags.getType(tag) == FRBRData.TYPE_TITLE)
        return data.tags.getString(tag);
      pos = data.docTags.nextPos(pos);
    }

    // No title tag on this document.
    return "";
  }

  /**
   * Compare a particular field of two groups, each represented by one of its
   * documents. The first tag of the requested type on each document is used,
   * and the tag numbers themselves are compared.
   *
   * @param type     field type to compare
   * @param doc1     document standing for the first group
   * @param doc2     document standing for the second group
   * @param reverse  true to invert the ordering
   * @return         -1, 0 or +1; a document lacking the field always sorts
   *                 after one that has it, whether or not reversed
   */
  private int compareField(int type, int doc1, int doc2, boolean reverse) 
  {
    // Scan the first doc's tags until one of the wanted type turns up
    // (zero means "not found yet").
    int tag1 = 0;
    int pos = data.docTags.firstPos(doc1);
    while (pos >= 0 && tag1 == 0) {
      final int candidate = data.docTags.getValue(pos);
      if (data.tags.getType(candidate) == type)
        tag1 = candidate;
      pos = data.docTags.nextPos(pos);
    }

    // Same scan over the second doc.
    int tag2 = 0;
    pos = data.docTags.firstPos(doc2);
    while (pos >= 0 && tag2 == 0) {
      final int candidate = data.docTags.getValue(pos);
      if (data.tags.getType(candidate) == type)
        tag2 = candidate;
      pos = data.docTags.nextPos(pos);
    }

    // A doc without the field gets an extreme stand-in value, chosen so that
    // it lands at the end of the ordering in either direction.
    final int missing = reverse ? Integer.MIN_VALUE : Integer.MAX_VALUE;
    if (tag1 == 0)
      tag1 = missing;
    if (tag2 == 0)
      tag2 = missing;

    // Plain numeric ordering of the tags, flipped when reversed.
    int order;
    if (tag1 == tag2)
      order = 0;
    else
      order = (tag1 < tag2) ? -1 : +1;
    return reverse ? -order : order;
  } // compareField
} // class FRBRGroupData