/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.lucene.syns;

import java.nio.file.Path;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

/**
 * Settings holder for the syntactic-synonyms tooling.
 * <p>
 * It bundles three things: the fixed field names, the locations of the
 * n-gram, context and syns indexes (each a fixed sub-directory name resolved
 * against a caller-supplied root directory), and a set of integer thresholds
 * exposed through plain getters and setters. Setters store the value they
 * are given without any validation.
 */
class SyntacticSynsConfig {

    /* ---- field names ---- */

    private static final String NGRAM_FIELD = "ngrams";
    private static final String CONTEXT_FIELD = "c";
    private static final String SYNS_TARGET_FIELD_NAME = "target";
    private static final String SYNS_PRE_FIELD_NAME = "pre";
    private static final String SYNS_POST_FIELD_NAME = "post";

    /* ---- index directory names, resolved against indexDir ---- */

    private static final String NGRAM_INDEX_NAME = "ngram_index";
    private static final String CONTEXT_INDEX_NAME = "context_index";
    private static final String SYNS_INDEX_NAME = "syns_index";

    /** Root directory under which the individual indexes live. */
    private final Path indexDir;

    /** Maximum key phrase length; defaults to 3. */
    private int maxKeyPhraseLength = 3;

    /** Max number of context tokens in a field; if greater than this, normalize. */
    private int maxContextTokenCount = 9000;

    /**
     * If the key shows up this many times, ignore it; this is meant to keep,
     * e.g. "the" from being a key.
     */
    private int maxTargetTypeCount = 50000;

    /** Min term frequency of a given key phrase. */
    private int minKeyPhraseTermFrequency = 1;

    /** Min document frequency of a given key's contexts. */
    private int minTargetCount = 5;

    /** Minimum type count for a context. */
    private int minContextTypeCount = 1;

    /** Minimum tokens for a context. */
    private int minContextTokenCount = 1;

    /** How many types to include in both the pre and post queries. */
    private int maxTermsPerFieldInQuery = 30;

    /** Minimum context frequency in a query; defaults to 5. */
    private int minContextFrequencyInQuery = 5;

    /** Size of the "pre" context; defaults to 1. */
    private int preContextSize = 1;

    /** Size of the "post" context; defaults to 1. */
    private int postContextSize = 1;

    /**
     * Max tokens to read from the input string. Fixed at -1 with no setter.
     * NOTE(review): -1 presumably means "no limit" -- confirm against the readers.
     */
    private final int maxTokenToReadCount = -1;

    /** Analyzer for the raw input: a StandardAnalyzer built with an empty stop set. */
    private final Analyzer baseAnalyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);

    /**
     * Creates a configuration with default thresholds.
     *
     * @param indexDir root directory against which the index sub-directories
     *                 are resolved; stored as-is
     */
    public SyntacticSynsConfig(Path indexDir) {
        this.indexDir = indexDir;
    }

    /* ---- static field-name accessors ---- */

    /** @return name of the n-gram field ({@code "ngrams"}) */
    public static String getNgramField() {
        return NGRAM_FIELD;
    }

    /** @return name of the context field ({@code "c"}) */
    public static String getContextField() {
        return CONTEXT_FIELD;
    }

    /** @return name of the syns target field ({@code "target"}) */
    public static String getSynsTargetFieldName() {
        return SYNS_TARGET_FIELD_NAME;
    }

    /** @return name of the syns pre-context field ({@code "pre"}) */
    public static String getSynsPreFieldName() {
        return SYNS_PRE_FIELD_NAME;
    }

    /** @return name of the syns post-context field ({@code "post"}) */
    public static String getSynsPostFieldName() {
        return SYNS_POST_FIELD_NAME;
    }

    /* ---- tunable thresholds ---- */

    /** @return how many types to include in both the pre and post queries */
    public int getMaxTermsPerFieldInQuery() {
        return maxTermsPerFieldInQuery;
    }

    /** @param maxTermsPerFieldInQuery how many types to include in both the pre and post queries */
    public void setMaxTermsPerFieldInQuery(int maxTermsPerFieldInQuery) {
        this.maxTermsPerFieldInQuery = maxTermsPerFieldInQuery;
    }

    /** @return the configured maximum key phrase length */
    public int getMaxKeyPhraseLength() {
        return maxKeyPhraseLength;
    }

    /** @param maxKeyPhraseLength new maximum key phrase length */
    public void setMaxKeyPhraseLength(int maxKeyPhraseLength) {
        this.maxKeyPhraseLength = maxKeyPhraseLength;
    }

    /** @return max number of context tokens in a field before normalization applies */
    public int getMaxContextTokenCount() {
        return maxContextTokenCount;
    }

    /** @param maxContextTokenCount max number of context tokens in a field */
    public void setMaxContextTokenCount(int maxContextTokenCount) {
        this.maxContextTokenCount = maxContextTokenCount;
    }

    /** @return the count at which a key is considered too common and ignored */
    public int getMaxTargetTypeCount() {
        return maxTargetTypeCount;
    }

    /** @param maxTargetTypeCount count at which a key is ignored */
    public void setMaxTargetTypeCount(int maxTargetTypeCount) {
        this.maxTargetTypeCount = maxTargetTypeCount;
    }

    /** @return min term frequency of a given key phrase */
    public int getMinKeyPhraseTermFrequency() {
        return minKeyPhraseTermFrequency;
    }

    /** @param minKeyPhraseTermFrequency min term frequency of a given key phrase */
    public void setMinKeyPhraseTermFrequency(int minKeyPhraseTermFrequency) {
        this.minKeyPhraseTermFrequency = minKeyPhraseTermFrequency;
    }

    /** @return min document frequency of a given key's contexts */
    public int getMinTargetCount() {
        return minTargetCount;
    }

    /** @param minTargetCount min document frequency of a given key's contexts */
    public void setMinTargetCount(int minTargetCount) {
        this.minTargetCount = minTargetCount;
    }

    /** @return minimum type count for a context */
    public int getMinContextTypeCount() {
        return minContextTypeCount;
    }

    /** @param minContextTypeCount minimum type count for a context */
    public void setMinContextTypeCount(int minContextTypeCount) {
        this.minContextTypeCount = minContextTypeCount;
    }

    /** @return minimum tokens for a context */
    public int getMinContextTokenCount() {
        return minContextTokenCount;
    }

    /** @param minContextTokenCount minimum tokens for a context */
    public void setMinContextTokenCount(int minContextTokenCount) {
        this.minContextTokenCount = minContextTokenCount;
    }

    /** @return the configured pre-context size */
    public int getPreContextSize() {
        return preContextSize;
    }

    /** @param preContextSize new pre-context size */
    public void setPreContextSize(int preContextSize) {
        this.preContextSize = preContextSize;
    }

    /** @return the configured post-context size */
    public int getPostContextSize() {
        return postContextSize;
    }

    /** @param postContextSize new post-context size */
    public void setPostContextSize(int postContextSize) {
        this.postContextSize = postContextSize;
    }

    /* ---- index locations ---- */

    /** @return {@code indexDir} resolved with {@code "ngram_index"} */
    public Path getNGramIndex() {
        return underIndexDir(NGRAM_INDEX_NAME);
    }

    /** @return {@code indexDir} resolved with {@code "context_index"} */
    public Path getContextIndex() {
        return underIndexDir(CONTEXT_INDEX_NAME);
    }

    /** @return {@code indexDir} resolved with {@code "syns_index"} */
    public Path getSynsIndex() {
        return underIndexDir(SYNS_INDEX_NAME);
    }

    /** @return the root index directory passed to the constructor */
    public Path getIndexDir() {
        return indexDir;
    }

    /** @return the configured minimum context frequency in a query */
    public int getMinContextFrequencyInQuery() {
        return minContextFrequencyInQuery;
    }

    /** @param minContextFrequencyInQuery new minimum context frequency in a query */
    public void setMinContextFrequencyInQuery(int minContextFrequencyInQuery) {
        this.minContextFrequencyInQuery = minContextFrequencyInQuery;
    }

    /**
     * @return maximum number of tokens to read from the original string
     */
    public int getMaxTokenToReadCount() {
        return maxTokenToReadCount;
    }

    /**
     * This is the base analyzer to use for the initial input string.
     * The NGramIndexBuilders wrap this in a ShingleAnalyzerWrapper.
     *
     * @return analyzer to use for the initial input string
     */
    public Analyzer getBaseAnalyzer() {
        return baseAnalyzer;
    }

    /** Resolves one of the fixed index directory names against the root directory. */
    private Path underIndexDir(String indexName) {
        return indexDir.resolve(indexName);
    }
}