/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tallison.lucene.syns;
import java.nio.file.Path;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
/**
 * Configuration for the syntactic-synonyms tooling: the fixed field and index
 * names, the root directory under which the individual indexes live, and the
 * tunable thresholds used while building and querying those indexes.
 *
 * <p>Instances are mutable through their setters; no synchronization is
 * performed here.
 */
class SyntacticSynsConfig {

    // Field names and index sub-directory names. These are fixed for all instances.
    private static final String NGRAM_FIELD = "ngrams";
    private static final String NGRAM_INDEX_NAME = "ngram_index";
    private static final String CONTEXT_FIELD = "c";
    private static final String CONTEXT_INDEX_NAME = "context_index";
    private static final String SYNS_TARGET_FIELD_NAME = "target";
    private static final String SYNS_PRE_FIELD_NAME = "pre";
    private static final String SYNS_POST_FIELD_NAME = "post";
    private static final String SYNS_INDEX_NAME = "syns_index";

    /** Root directory; the ngram, context and syns indexes are resolved against it. */
    private final Path indexDir;

    private int maxKeyPhraseLength = 3;

    /** Max number of context tokens in a field; if greater than this, normalize. */
    private int maxContextTokenCount = 9000;

    /**
     * If the key shows up this many times, ignore it; this is meant to keep,
     * e.g. "the" from being a key.
     */
    private int maxTargetTypeCount = 50000;

    /** Min term frequency of a given key phrase. */
    private int minKeyPhraseTermFrequency = 1;

    /** Min document frequency of a given key's contexts. */
    private int minTargetCount = 5;

    /** Minimum type count for a context. */
    private int minContextTypeCount = 1;

    /** Minimum token count for a context. */
    private int minContextTokenCount = 1;

    /** How many types to include in both the pre and post queries. */
    private int maxTermsPerFieldInQuery = 30;

    private int minContextFrequencyInQuery = 5;
    private int preContextSize = 1;
    private int postContextSize = 1;

    /**
     * Max tokens to read from the input string. Fixed at -1 with no setter;
     * NOTE(review): -1 presumably means "no limit" -- confirm against the readers.
     */
    private final int maxTokenToReadCount = -1;

    /** Standard analyzer constructed with an empty stop-word set. */
    private final Analyzer baseAnalyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);

    /**
     * Creates a configuration rooted at the given directory.
     *
     * @param indexDir root directory that the index paths are resolved against
     * @throws NullPointerException if {@code indexDir} is {@code null}
     */
    public SyntacticSynsConfig(Path indexDir) {
        // Fail fast: otherwise a null only surfaces later, inside the
        // resolve(...) calls of the index getters, far from the real mistake.
        this.indexDir = Objects.requireNonNull(indexDir, "indexDir must not be null");
    }

    /** @return name of the field holding ngrams */
    public static String getNgramField() {
        return NGRAM_FIELD;
    }

    /** @return name of the field holding contexts */
    public static String getContextField() {
        return CONTEXT_FIELD;
    }

    /** @return name of the target field in the syns index */
    public static String getSynsTargetFieldName() {
        return SYNS_TARGET_FIELD_NAME;
    }

    /** @return name of the "pre" field in the syns index */
    public static String getSynsPreFieldName() {
        return SYNS_PRE_FIELD_NAME;
    }

    /** @return name of the "post" field in the syns index */
    public static String getSynsPostFieldName() {
        return SYNS_POST_FIELD_NAME;
    }

    /** @return how many types to include in both the pre and post queries */
    public int getMaxTermsPerFieldInQuery() {
        return maxTermsPerFieldInQuery;
    }

    /** @param maxTermsPerFieldInQuery how many types to include in both the pre and post queries */
    public void setMaxTermsPerFieldInQuery(int maxTermsPerFieldInQuery) {
        this.maxTermsPerFieldInQuery = maxTermsPerFieldInQuery;
    }

    /** @return maximum key phrase length (default 3) */
    public int getMaxKeyPhraseLength() {
        return maxKeyPhraseLength;
    }

    /** @param maxKeyPhraseLength maximum key phrase length */
    public void setMaxKeyPhraseLength(int maxKeyPhraseLength) {
        this.maxKeyPhraseLength = maxKeyPhraseLength;
    }

    /** @return max number of context tokens in a field before normalizing */
    public int getMaxContextTokenCount() {
        return maxContextTokenCount;
    }

    /** @param maxContextTokenCount max number of context tokens in a field before normalizing */
    public void setMaxContextTokenCount(int maxContextTokenCount) {
        this.maxContextTokenCount = maxContextTokenCount;
    }

    /** @return count at which a too-frequent key is ignored */
    public int getMaxTargetTypeCount() {
        return maxTargetTypeCount;
    }

    /** @param maxTargetTypeCount count at which a too-frequent key is ignored */
    public void setMaxTargetTypeCount(int maxTargetTypeCount) {
        this.maxTargetTypeCount = maxTargetTypeCount;
    }

    /** @return min term frequency of a given key phrase */
    public int getMinKeyPhraseTermFrequency() {
        return minKeyPhraseTermFrequency;
    }

    /** @param minKeyPhraseTermFrequency min term frequency of a given key phrase */
    public void setMinKeyPhraseTermFrequency(int minKeyPhraseTermFrequency) {
        this.minKeyPhraseTermFrequency = minKeyPhraseTermFrequency;
    }

    /** @return min document frequency of a given key's contexts */
    public int getMinTargetCount() {
        return minTargetCount;
    }

    /** @param minTargetCount min document frequency of a given key's contexts */
    public void setMinTargetCount(int minTargetCount) {
        this.minTargetCount = minTargetCount;
    }

    /** @return minimum type count for a context */
    public int getMinContextTypeCount() {
        return minContextTypeCount;
    }

    /** @param minContextTypeCount minimum type count for a context */
    public void setMinContextTypeCount(int minContextTypeCount) {
        this.minContextTypeCount = minContextTypeCount;
    }

    /** @return minimum token count for a context */
    public int getMinContextTokenCount() {
        return minContextTokenCount;
    }

    /** @param minContextTokenCount minimum token count for a context */
    public void setMinContextTokenCount(int minContextTokenCount) {
        this.minContextTokenCount = minContextTokenCount;
    }

    /** @return configured pre-context size (default 1) */
    public int getPreContextSize() {
        return preContextSize;
    }

    /** @param preContextSize pre-context size */
    public void setPreContextSize(int preContextSize) {
        this.preContextSize = preContextSize;
    }

    /** @return configured post-context size (default 1) */
    public int getPostContextSize() {
        return postContextSize;
    }

    /** @param postContextSize post-context size */
    public void setPostContextSize(int postContextSize) {
        this.postContextSize = postContextSize;
    }

    /** @return path of the ngram index, i.e. {@code indexDir/ngram_index} */
    public Path getNGramIndex() {
        return indexDir.resolve(NGRAM_INDEX_NAME);
    }

    /** @return path of the context index, i.e. {@code indexDir/context_index} */
    public Path getContextIndex() {
        return indexDir.resolve(CONTEXT_INDEX_NAME);
    }

    /** @return path of the syns index, i.e. {@code indexDir/syns_index} */
    public Path getSynsIndex() {
        return indexDir.resolve(SYNS_INDEX_NAME);
    }

    /** @return the root index directory passed to the constructor */
    public Path getIndexDir() {
        return indexDir;
    }

    /** @return minimum context frequency in a query (default 5) */
    public int getMinContextFrequencyInQuery() {
        return minContextFrequencyInQuery;
    }

    /** @param minContextFrequencyInQuery minimum context frequency in a query */
    public void setMinContextFrequencyInQuery(int minContextFrequencyInQuery) {
        this.minContextFrequencyInQuery = minContextFrequencyInQuery;
    }

    /**
     * @return maximum number of tokens to read from the original string;
     *         currently always -1 because the value is fixed
     */
    public int getMaxTokenToReadCount() {
        return maxTokenToReadCount;
    }

    /**
     * This is the base analyzer to use for the initial input string.
     * The NGramIndexBuilders wrap this in a ShingleAnalyzerWrapper.
     *
     * @return analyzer to use for the initial input string
     */
    public Analyzer getBaseAnalyzer() {
        return baseAnalyzer;
    }
}