TestTokenInfoDictionary.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.dict;


import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntsRefFSTEnum;
import org.apache.lucene.util.fst.IntsRefFSTEnum.InputOutput;

public class TestTokenInfoDictionary extends LuceneTestCase {

  /**
   * Walks every term of the dictionary FST, resolves the word ids behind each term and runs
   * basic sanity checks on everything the dictionary returns for those word ids.
   */
  public void testEnumerateAll() throws Exception {
    TokenInfoDictionary dict = TokenInfoDictionary.getInstance();
    ConnectionCosts costs = ConnectionCosts.getInstance();
    FST<Long> fst = dict.getFST().getInternalFST();
    IntsRefFSTEnum<Long> entries = new IntsRefFSTEnum<>(fst);
    IntsRef wordIds = new IntsRef();

    // the two counters are only reported when VERBOSE is on
    int termCount = 0;
    int wordCount = 0;
    int previousSourceId = -1;
    int previousWordId = -1;

    for (InputOutput<Long> entry = entries.next(); entry != null; entry = entries.next()) {
      termCount++;

      char[] term = toChars(entry.input);
      assertTrue(UnicodeUtil.validUTF16String(new String(term)));

      // terms are visited in order, so source ids (and word ids below) must strictly increase
      int sourceId = entry.output.intValue();
      assertTrue(sourceId > previousSourceId);
      previousSourceId = sourceId;

      dict.lookupWordIds(sourceId, wordIds);
      int end = wordIds.offset + wordIds.length;
      for (int pos = wordIds.offset; pos < end; pos++) {
        wordCount++;
        int wordId = wordIds.ints[pos];
        assertTrue(wordId > previousWordId);
        previousWordId = wordId;

        checkWord(dict, costs, wordId, term);
      }
    }

    if (VERBOSE) {
      System.out.println("checked " + termCount + " terms, " + wordCount + " words.");
    }
  }

  /** Copies the ints referenced by {@code ref} into a new char array, casting each int to char. */
  private static char[] toChars(IntsRef ref) {
    char[] result = new char[ref.length];
    int src = ref.offset;
    for (int dst = 0; dst < result.length; dst++, src++) {
      result[dst] = (char) ref.ints[src];
    }
    return result;
  }

  /** Passes when {@code s} is null; otherwise requires it to be a valid UTF-16 string. */
  private static void assertNullOrValidUTF16(String s) {
    if (s != null) {
      assertTrue(UnicodeUtil.validUTF16String(s));
    }
  }

  /**
   * Runs the per-word checks: optional fields may be null but must be valid UTF-16 when present,
   * mandatory fields must be non-null and valid UTF-16, and the cost lookups must simply succeed.
   *
   * @param dict the dictionary under test
   * @param costs connection cost matrix queried with the word's right/left ids
   * @param wordId id of the word to check
   * @param term surface form of the term the word id was reached from
   */
  private static void checkWord(TokenInfoDictionary dict, ConnectionCosts costs,
                                int wordId, char[] term) {
    assertNullOrValidUTF16(dict.getBaseForm(wordId, term, 0, term.length));

    String inflectionForm = dict.getInflectionForm(wordId);
    if (inflectionForm != null) {
      assertTrue(UnicodeUtil.validUTF16String(inflectionForm));
      // it must be a real ipadic inflection form, i.e. one with a translation
      assertNotNull(ToStringUtil.getInflectedFormTranslation(inflectionForm));
    }

    String inflectionType = dict.getInflectionType(wordId);
    if (inflectionType != null) {
      assertTrue(UnicodeUtil.validUTF16String(inflectionType));
      // it must be a real ipadic inflection type, i.e. one with a translation
      assertNotNull(ToStringUtil.getInflectionTypeTranslation(inflectionType));
    }

    // return values are ignored: these calls only have to complete without throwing
    int leftId = dict.getLeftId(wordId);
    int rightId = dict.getRightId(wordId);
    costs.get(rightId, leftId);
    dict.getWordCost(wordId);

    String partOfSpeech = dict.getPartOfSpeech(wordId);
    assertNotNull(partOfSpeech);
    assertTrue(UnicodeUtil.validUTF16String(partOfSpeech));
    // it must be a real ipadic pos tag, i.e. one with a translation
    assertNotNull(ToStringUtil.getPOSTranslation(partOfSpeech));

    String pronunciation = dict.getPronunciation(wordId, term, 0, term.length);
    assertNotNull(pronunciation);
    assertTrue(UnicodeUtil.validUTF16String(pronunciation));

    String reading = dict.getReading(wordId, term, 0, term.length);
    assertNotNull(reading);
    assertTrue(UnicodeUtil.validUTF16String(reading));
  }
}