package org.fastcatsearch.ir.dictionary;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.fastcatsearch.ir.io.CharVector;
import org.fastcatsearch.ir.io.DataInput;
import org.fastcatsearch.ir.io.DataOutput;
import org.fastcatsearch.ir.util.CharVectorHashSet;
import org.fastcatsearch.ir.util.CharVectorUtils;
import org.fastcatsearch.plugin.analysis.AnalysisPluginSetting.ColumnSetting;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
/**
 * Synonym dictionary: maps a word to the array of its synonyms (stored in the
 * inherited {@code map}) and additionally keeps {@link #getWordSet() a set} of
 * every word that appeared in any entry - keywords, synonyms, and the
 * whitespace-separated parts of multi-word terms.
 *
 * <p>Entries come in two flavours (see {@link #addEntry}):
 * <ul>
 *   <li><b>one-way</b> - a keyword is given; the keyword maps to the synonyms;</li>
 *   <li><b>two-way</b> - no keyword is given; every synonym maps to all the others.</li>
 * </ul>
 * In both cases a key that contains whitespace is also registered a second
 * time with the whitespace removed.
 */
public class SynonymDictionary extends MapDictionary {

    /** Every word seen by {@link #addEntry}, including parts of multi-word terms. */
    private Set<CharVector> wordSet;

    public SynonymDictionary() {
        this(false);
    }

    public SynonymDictionary(boolean isIgnoreCase) {
        super(isIgnoreCase);
        ensureWordSet(isIgnoreCase);
    }

    public SynonymDictionary(File file, boolean isIgnoreCase) {
        super(file, isIgnoreCase);
        ensureWordSet(isIgnoreCase);
    }

    public SynonymDictionary(InputStream is, boolean isIgnoreCase) {
        super(is, isIgnoreCase);
        ensureWordSet(isIgnoreCase);
    }

    /**
     * Creates the word set unless one already exists. The field has no
     * initializer, so a value assigned while the super constructor was running
     * (for example through {@link #readFrom}) is preserved rather than replaced.
     */
    private void ensureWordSet(boolean isIgnoreCase) {
        if (wordSet == null) {
            wordSet = new CharVectorHashSet(isIgnoreCase);
        }
    }

    /** Returns the live, mutable word set backing this dictionary. */
    public Set<CharVector> getWordSet() {
        return wordSet;
    }

    /** Replaces the word set with the given instance (no copy is made). */
    public void setWordSet(Set<CharVector> wordSet) {
        this.wordSet = wordSet;
    }

    /** Returns a read-only view of the word set. */
    public Set<CharVector> getUnmodifiableWordSet() {
        return Collections.unmodifiableSet(wordSet);
    }

    /**
     * Adds one dictionary entry.
     *
     * @param keyword           the main word of a one-way entry; {@code null} or blank
     *                          marks a two-way entry
     * @param values            column values; only {@code values[0]} is read and it is
     *                          expected to hold all synonyms separated by commas.
     *                          A {@code null}/empty array or a {@code null} first
     *                          element adds no mapping (the keyword is still
     *                          recorded in the word set)
     * @param columnSettingList not used by this implementation
     */
    @Override
    public void addEntry(String keyword, Object[] values, List<ColumnSetting> columnSettingList) {
        CharVector mainWord = null;
        if (keyword != null) {
            keyword = keyword.trim();
            if (keyword.length() > 0) {
                mainWord = new CharVector(keyword);
                registerWord(mainWord);
            }
        }

        if (values == null || values.length == 0 || values[0] == null) {
            return;
        }

        // All synonyms are stored comma-separated in column 0, so only [0] is inspected.
        List<CharVector> synonyms = parseSynonyms(values[0].toString());
        for (CharVector synonym : synonyms) {
            registerWord(synonym);
        }

        if (mainWord == null) {
            // Two-way: each synonym becomes a key pointing at all the other synonyms.
            for (CharVector key : synonyms) {
                putWithWhitespaceVariant(key, allExcept(synonyms, key));
            }
        } else {
            // One-way: the keyword points at the synonyms (minus itself).
            putWithWhitespaceVariant(mainWord, allExcept(synonyms, mainWord));
        }
    }

    /**
     * Splits a comma-separated synonym string into distinct, trimmed, non-empty
     * terms, keeping the first occurrence of each term in its original position.
     */
    private List<CharVector> parseSynonyms(String valueString) {
        // Trim BEFORE de-duplicating so that "a, a" collapses to a single "a".
        Set<String> distinct = new LinkedHashSet<String>();
        for (String token : valueString.split(",")) {
            String synonym = token.trim();
            if (synonym.length() > 0) {
                distinct.add(synonym);
            }
        }
        List<CharVector> result = new ArrayList<CharVector>(distinct.size());
        for (String synonym : distinct) {
            result.add(new CharVector(synonym));
        }
        return result;
    }

    /** Records a word in the word set, plus each whitespace-separated part of it. */
    private void registerWord(CharVector word) {
        wordSet.add(word);
        if (word.hasWhitespaces()) {
            for (CharVector part : CharVectorUtils.splitByWhitespace(word)) {
                wordSet.add(part);
            }
        }
    }

    /** Returns the words of {@code words} that are not equal to {@code excluded}, in order. */
    private CharVector[] allExcept(List<CharVector> words, CharVector excluded) {
        List<CharVector> kept = new ArrayList<CharVector>(words.size());
        for (CharVector word : words) {
            if (!excluded.equals(word)) {
                kept.add(word);
            }
        }
        return kept.toArray(new CharVector[kept.size()]);
    }

    /**
     * Stores {@code synonyms} under {@code key}, merging with anything already
     * mapped there. If the key contains whitespace, the resulting synonym list is
     * also merged into the mapping of the key with its whitespace removed.
     * Does nothing when {@code synonyms} is empty.
     */
    private void putWithWhitespaceVariant(CharVector key, CharVector[] synonyms) {
        if (synonyms.length == 0) {
            return;
        }
        CharVector[] stored = putMerged(key, synonyms);
        if (key.hasWhitespaces()) {
            putMerged(key.duplicate().removeWhitespaces(), stored);
        }
    }

    /**
     * Puts {@code synonyms} under {@code key}, merged with the previous mapping
     * when one exists, and returns the array that was stored.
     */
    private CharVector[] putMerged(CharVector key, CharVector[] synonyms) {
        CharVector[] existing = map.get(key);
        CharVector[] stored = (existing == null) ? synonyms : mergeSynonyms(existing, synonyms);
        map.put(key, stored);
        return stored;
    }

    /**
     * Merges two synonym arrays without modifying either of them.
     *
     * @param value2 the previously stored synonyms
     * @param value  the newly added synonyms
     * @return the entries of {@code value2} that do not occur in {@code value},
     *         followed by the non-null entries of {@code value}
     */
    private CharVector[] mergeSynonyms(CharVector[] value2, CharVector[] value) {
        List<CharVector> merged = new ArrayList<CharVector>(value2.length + value.length);
        for (CharVector previous : value2) {
            if (previous != null && !containsWord(value, previous)) {
                merged.add(previous);
            }
        }
        for (CharVector added : value) {
            if (added != null) {
                merged.add(added);
            }
        }
        return merged.toArray(new CharVector[merged.size()]);
    }

    /** Returns whether any non-null element of {@code words} equals {@code target}. */
    private boolean containsWord(CharVector[] words, CharVector target) {
        for (CharVector word : words) {
            if (word != null && word.equals(target)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the inherited dictionary data followed by the word set
     * (a VInt count, then each word as a UString).
     * The given stream is not closed.
     */
    @Override
    public void writeTo(OutputStream out) throws IOException {
        super.writeTo(out);
        DataOutput output = new OutputStreamDataOutput(out);
        // Number of words, then the words themselves.
        output.writeVInt(wordSet.size());
        for (CharVector word : wordSet) {
            output.writeUString(word.array(), word.start(), word.length());
        }
    }

    /**
     * Reads the inherited dictionary data followed by the word set, in the
     * layout produced by {@link #writeTo}. The current word set is replaced only
     * after the new one has been read completely.
     */
    @Override
    public void readFrom(InputStream in) throws IOException {
        super.readFrom(in);
        DataInput input = new InputStreamDataInput(in);
        Set<CharVector> loaded = new CharVectorHashSet(ignoreCase);
        int size = input.readVInt();
        for (int entryInx = 0; entryInx < size; entryInx++) {
            loaded.add(new CharVector(input.readUString()));
        }
        wordSet = loaded;
    }

    /**
     * Replaces this dictionary's content with that of another
     * {@code SynonymDictionary}. The word set instance of the argument is adopted
     * as-is (not copied), so both dictionaries share it afterwards.
     *
     * @throws IllegalArgumentException if {@code object} is {@code null} or not a
     *                                  {@code SynonymDictionary}
     */
    @Override
    public void reload(Object object) throws IllegalArgumentException {
        if (!(object instanceof SynonymDictionary)) {
            throw new IllegalArgumentException("Reload dictionary argument error. argument = " + object);
        }
        super.reload(object);
        SynonymDictionary synonymDictionary = (SynonymDictionary) object;
        this.wordSet = synonymDictionary.getWordSet();
    }

    /** Removes all inherited dictionary data and empties the word set. */
    @Override
    public void clear() {
        super.clear();
        wordSet.clear();
    }
}