/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.dictionary;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;

import com.bizosys.hsearch.filter.IStorable;
import com.bizosys.hsearch.filter.Storable;
import com.bizosys.oneline.util.StringUtils;
/**
 * Represents an entry in the dictionary: a stemmed word together with its
 * types, document frequency, related words and detail text.
 *
 * <p>Several types may be attached to one word; they are kept in
 * {@link #type} as a single string joined by {@link #TYPE_SEPARATOR}.
 *
 * <p>Byte layout produced by {@link #toBytes()} and consumed by
 * {@link #fromBytes(byte[], int)}:
 * <pre>
 * [2: word length][word][2: type length][type][4: frequency]
 * [2: related length][related][2: detail length][detail]
 * </pre>
 * A length of zero means the field is absent ({@code null}).
 *
 * @author Abinasha karana
 */
public class DictEntry implements IStorable {

    /** Separator placed between the individual types held in {@link #type}. */
    public static final String TYPE_SEPARATOR = "\t";

    /**
     * Characters treated as delimiters when types are merged by
     * {@link #addType(String)}: the comma used in caller input and the
     * internal {@link #TYPE_SEPARATOR}.
     */
    private static final String TYPE_DELIMITERS = "," + TYPE_SEPARATOR;

    /** Number of bytes used for the length prefix of each string field. */
    private static final int LENGTH_BYTES = 2;

    /** Number of bytes the frequency occupies when read back. */
    private static final int FREQUENCY_BYTES = 4;

    /** Shared stand-in for an absent string field during serialization. */
    private static final byte[] NO_BYTES = new byte[0];

    /**
     * The stemmed word
     */
    public String word = null;

    /**
     * The word type(s), joined by {@link #TYPE_SEPARATOR}
     */
    public String type = null;

    /**
     * Number of documents in which this word is sighted
     */
    public int frequency = 1;

    /**
     * Synonyms of this word
     */
    public String related = null;

    /**
     * The original unstemmed word
     */
    public String detail = null;

    /**
     * Private Default Constructor
     */
    private DictEntry() {
    }

    /**
     * Constructor, initialize by deserializing the stored bytes, reading
     * from the start of the array.
     * @param value The serialized entry
     */
    public DictEntry(byte[] value) {
        fromBytes(value, 0);
    }

    /**
     * Constructor, initialize by deserializing the stored bytes
     * @param value The array holding the serialized entry
     * @param bytePos The index in <code>value</code> at which the entry starts
     */
    public DictEntry(byte[] value, int bytePos) {
        fromBytes(value, bytePos);
    }

    /**
     * Constructor
     * @param fldWord The stemmed word
     */
    public DictEntry(String fldWord) {
        this.word = fldWord;
    }

    /**
     * Constructor
     * @param fldWord The stemmed word
     * @param fldType The word type; stored trimmed and lower-cased
     * @param fldFreq No. of documents containing this word
     * @param related Synonyms of this word
     * @param fldDetailXml Detail about this word like the thesaurus hierarchy
     */
    public DictEntry(String fldWord, String fldType,
            int fldFreq, String related, String fldDetailXml) {
        this.word = fldWord;
        if (null != fldType) {
            // Locale.ROOT keeps the folding independent of the JVM default locale.
            this.type = fldType.trim().toLowerCase(Locale.ROOT);
        }
        this.frequency = fldFreq;
        this.related = related;
        this.detail = fldDetailXml;
    }

    /**
     * Constructor
     * @param fldWord The stemmed word
     * @param fldType The word type; stored trimmed and lower-cased
     * @param fldFreq No. of documents containing this word. When
     * <code>null</code> the default frequency of 1 is kept.
     */
    public DictEntry(String fldWord, String fldType, Integer fldFreq) {
        this.word = fldWord;
        if (null != fldType) {
            this.type = fldType.trim().toLowerCase(Locale.ROOT);
        }
        // Guard the unboxing: a null Integer would otherwise throw a NullPointerException.
        if (null != fldFreq) {
            this.frequency = fldFreq.intValue();
        }
    }

    /**
     * Set the synonyms of this word. Pass all synonyms together in a comma
     * separated way; the given value replaces whatever was stored before.
     * @param related Related words
     */
    public void addRelatedWord(String related) {
        if (DictLog.l.isDebugEnabled()) {
            DictLog.l.debug(" Related " + related);
        }
        this.related = related;
    }

    /**
     * Serialize the dictionary entry using the layout described on the class.
     * @return The serialized bytes
     * @throws IllegalStateException If a string field is too long for its
     * 2 byte signed length prefix
     */
    public byte[] toBytes() {
        byte[] wordB = encode(word, "word");
        byte[] typeB = encode(type, "type");
        byte[] freqB = Storable.putInt(frequency);
        byte[] relatedB = encode(related, "related");
        byte[] detailB = encode(detail, "detail");

        // Four string fields, each preceded by its own length prefix.
        int total = 4 * LENGTH_BYTES + wordB.length + typeB.length
                + freqB.length + relatedB.length + detailB.length;
        byte[] value = new byte[total];

        int cursor = 0;
        cursor = writeField(wordB, value, cursor);
        cursor = writeField(typeB, value, cursor);
        System.arraycopy(freqB, 0, value, cursor, freqB.length);
        cursor += freqB.length;
        cursor = writeField(relatedB, value, cursor);
        writeField(detailB, value, cursor);
        return value;
    }

    /**
     * Populate this entry from serialized bytes. Every field is overwritten;
     * string fields stored with length zero become <code>null</code>, so no
     * value from a previous use of this instance survives.
     * @param data The array holding the serialized entry
     * @param readPos The index in <code>data</code> at which the entry starts
     * @return The index just past the last byte consumed
     * @throws IllegalArgumentException If a stored field length is negative
     */
    public int fromBytes(byte[] data, int readPos) {
        int cursor = readPos;

        short wordLen = Storable.getShort(cursor, data);
        cursor += LENGTH_BYTES;
        this.word = readString(data, cursor, wordLen);
        cursor += wordLen;

        short typeLen = Storable.getShort(cursor, data);
        cursor += LENGTH_BYTES;
        this.type = readString(data, cursor, typeLen);
        cursor += typeLen;

        this.frequency = Storable.getInt(cursor, data);
        cursor += FREQUENCY_BYTES;

        short relatedLen = Storable.getShort(cursor, data);
        cursor += LENGTH_BYTES;
        this.related = readString(data, cursor, relatedLen);
        cursor += relatedLen;

        short detailLen = Storable.getShort(cursor, data);
        cursor += LENGTH_BYTES;
        this.detail = readString(data, cursor, detailLen);
        cursor += detailLen;

        return cursor;
    }

    /**
     * Add one or more types to the word. Example "Bangalore" is a "City".
     * The input is lower-cased and may list several types separated by
     * commas; each type not yet present is appended to {@link #type} using
     * {@link #TYPE_SEPARATOR}. Empty items are skipped.
     * @param foundTypes The word type(s), e.g. <code>BODY,TITLE</code>
     */
    public void addType(String foundTypes) {
        if (StringUtils.isEmpty(foundTypes)) {
            return;
        }
        String incoming = foundTypes.toLowerCase(Locale.ROOT).trim();

        // Compare whole type names, not substrings: "title" must not be
        // considered present merely because "subtitle" is.
        List<String> known = splitTypes(this.type);

        StringBuilder merged = new StringBuilder();
        if (null != this.type) {
            merged.append(this.type);
        }

        /**
         * foundTypes=BODY,TITLE and this.type=TITLE
         */
        StringTokenizer items = new StringTokenizer(incoming, TYPE_DELIMITERS);
        while (items.hasMoreTokens()) {
            String candidate = items.nextToken().trim();
            if (candidate.length() == 0 || known.contains(candidate)) {
                continue;
            }
            if (merged.length() > 0) {
                merged.append(TYPE_SEPARATOR);
            }
            merged.append(candidate);
            known.add(candidate);
        }

        // Leave the field untouched when the input held no usable type.
        if (merged.length() > 0) {
            this.type = merged.toString();
        }
    }

    /**
     * Get all types associated to this word.
     * Ex. Hydrogen is a "Molecule" as well as a "Fuel"
     * @return All types, or <code>null</code> when no type is set
     */
    public List<String> getTypes() {
        if (StringUtils.isEmpty(this.type)) {
            return null;
        }
        List<String> values = new ArrayList<String>();
        StringTokenizer tokenizer = new StringTokenizer(this.type, TYPE_SEPARATOR);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (!StringUtils.isEmpty(token)) {
                values.add(token);
            }
        }
        return values;
    }

    /**
     * Forms a XML representation of this entry. Word, type and related are
     * written as escaped character data, with tabs in type and related shown
     * as commas. Detail is written as-is.
     * @param writer Writer
     * @throws IOException Write exception
     */
    public void toXml(Writer writer) throws IOException {
        writer.append("<aword>");
        if (null != this.word) {
            writer.append("<word>").append(escapeXml(this.word)).append("</word>");
        }
        if (null != this.type) {
            writer.append("<type>")
                .append(escapeXml(this.type.replace('\t', ',')))
                .append("</type>");
        }
        writer.append("<frequency>")
            .append(Integer.toString(this.frequency))
            .append("</frequency>");
        if (null != this.related) {
            writer.append("<related>")
                .append(escapeXml(this.related.replace('\t', ',')))
                .append("</related>");
        }
        if (null != this.detail) {
            // NOTE(review): detail is written unescaped on the assumption that it
            // already holds XML (the constructor parameter is named fldDetailXml) - confirm.
            writer.append("<detail>").append(this.detail).append("</detail>");
        }
        writer.append("</aword>");
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder(100);
        if (null != this.word) {
            text.append(" Word:[").append(this.word).append(']');
        }
        if (null != this.type) {
            text.append(" , Type:[").append(this.type.replace('\t', ',')).append(']');
        }
        text.append(" , Freq:[").append(this.frequency).append(']');
        if (null != this.related) {
            text.append(" , Related:[").append(this.related.replace('\t', ',')).append(']');
        }
        if (null != this.detail) {
            text.append(" , Detail:[").append(this.detail).append(']');
        }
        return text.toString();
    }

    /** Converts a string field to bytes, rejecting values too long for the length prefix. */
    private static byte[] encode(String text, String fieldName) {
        if (null == text) {
            return NO_BYTES;
        }
        byte[] bytes = Storable.putString(text);
        if (null == bytes) {
            return NO_BYTES;
        }
        if (bytes.length > Short.MAX_VALUE) {
            // Casting a larger length to short would silently write a wrong prefix.
            throw new IllegalStateException("Dictionary entry field '" + fieldName
                + "' is " + bytes.length + " bytes; the maximum is " + Short.MAX_VALUE);
        }
        return bytes;
    }

    /** Writes a length prefix followed by the field bytes; returns the offset after them. */
    private static int writeField(byte[] field, byte[] target, int offset) {
        byte[] lengthB = Storable.putShort((short) field.length);
        System.arraycopy(lengthB, 0, target, offset, LENGTH_BYTES);
        int bodyStart = offset + LENGTH_BYTES;
        System.arraycopy(field, 0, target, bodyStart, field.length);
        return bodyStart + field.length;
    }

    /** Reads a string of the given byte length at offset; a zero length yields null. */
    private static String readString(byte[] data, int offset, int length) {
        if (length < 0) {
            throw new IllegalArgumentException("Corrupt dictionary entry: field length "
                + length + " for the field starting at offset " + offset);
        }
        if (0 == length) {
            return null;
        }
        byte[] raw = new byte[length];
        System.arraycopy(data, offset, raw, 0, length);
        return Storable.getString(raw);
    }

    /** Splits a type string on comma or tab into trimmed, non-empty names. */
    private static List<String> splitTypes(String joined) {
        List<String> names = new ArrayList<String>();
        if (null == joined) {
            return names;
        }
        StringTokenizer parts = new StringTokenizer(joined, TYPE_DELIMITERS);
        while (parts.hasMoreTokens()) {
            String name = parts.nextToken().trim();
            if (name.length() > 0) {
                names.add(name);
            }
        }
        return names;
    }

    /** Replaces the XML markup characters &amp;, &lt; and &gt; with entity references. */
    private static String escapeXml(String text) {
        StringBuilder escaped = null;
        int size = text.length();
        for (int i = 0; i < size; i++) {
            char c = text.charAt(i);
            String entity = null;
            if (c == '&') {
                entity = "&amp;";
            } else if (c == '<') {
                entity = "&lt;";
            } else if (c == '>') {
                entity = "&gt;";
            }
            if (null == entity) {
                if (null != escaped) {
                    escaped.append(c);
                }
                continue;
            }
            if (null == escaped) {
                // First markup character seen: start the copy with the clean prefix.
                escaped = new StringBuilder(size + 16);
                escaped.append(text, 0, i);
            }
            escaped.append(entity);
        }
        return (null == escaped) ? text : escaped.toString();
    }
}