/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.index;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.util.StringUtils;
/**
 * A single word (term) of a document together with the compact attributes
 * that are kept for it inside the index.
 *
 * <p>Most attributes are held in {@code byte}/{@code short} fields. The term
 * position is kept in a lossy 16-bit encoding, see {@link #setTermPos(int)}
 * and {@link #getTermPos(short)}.</p>
 *
 * <p>Instances are mutable and carry no synchronization.</p>
 *
 * @author karan
 */
public class Term {

    /*
     * Sighting locations of a term.
     * NOTE(review): these and the term type names below are non-final only to
     * stay compatible with existing callers; treat them as constants.
     */
    public static Character TERMLOC_URL = 'U';
    public static Character TERMLOC_SUBJECT = 'S';
    public static Character TERMLOC_BODY = 'B';
    public static Character TERMLOC_META = 'M';
    public static Character TERMLOC_XML = 'X';
    public static Character TERMLOC_KEYWORD = 'K';

    /* Term type names. */
    public static String NO_TERM_TYPE = "";
    public static String TERMTYPE_ACRONUM = "ACR";
    public static String TERMTYPE_DATE = "DATE";
    public static String TERMTYPE_EMAIL = "MAIL";
    public static String TERMTYPE_ID = "ID";
    public static String TERMTYPE_URL = "URL";
    public static String TERMTYPE_NOUN = "NAME";
    public static String TERMTYPE_PHONE = "PHONE";
    public static String TERMTYPE_LINKTEXT = "LNKTXT";
    public static String TERMTYPE_MIME = "MM";

    /**
     * This is the position from which we use position jumps
     * to calculate the position.
     * The position codec of this class always works with the value 65000,
     * independent of any later assignment to this field.
     */
    public static int POSITION_JUMP_FROM = 65000;

    /** First actual position that is no longer stored exactly. */
    private static final int EXACT_POSITION_LIMIT = 65000;

    /** Width of one jump bucket used for positions at or beyond the exact limit. */
    private static final int JUMP_SIZE = 100000;

    /**
     * Encoded value of the last exactly stored position (64999), which is
     * also the encoded value of the first jump bucket.
     */
    private static final int ENCODED_JUMP_BASE = 32232;

    /** Actual position that stands for "position not known". */
    private static final int UNKNOWN_POSITION = -1;

    /** Frequency weights for the frequencies 1 to 9 (index = frequency - 1). */
    private static final byte[] FREQ_WEIGHTS = { 18, 20, 21, 22, 23, 24, 25, 26, 27 };

    /** Frequency weight used for every frequency above 9. */
    private static final byte MAX_FREQ_WEIGHT = 28;

    /**
     * This is the serial position of document in the bucket.
     * A bucket will have capability to store till 65536 documents.
     * {@link Short#MIN_VALUE} means "not set".
     */
    private short docPos = Short.MIN_VALUE;

    /**
     * The document type (Variation 256)
     * Now we can support total 256 types of document.
     * We can map one ID for different document type.
     * This will later can be filtered reading the meta fields
     * (Low probability of clashing)
     */
    private byte docTypeCode = Byte.MIN_VALUE;

    /**
     * The Term type (Variation 256)
     * Now we can support total 256 types of term types.
     * This is OK as we can map multiple types to same id
     * avoiding duplication in docType level (Low probability of clashing)
     */
    private byte termTypeCode = Byte.MIN_VALUE;

    /**
     * Term weight.
     */
    private byte weight = 0;

    /**
     * Encoded position of the term in the document, see
     * {@link #setTermPos(int)}. {@link Short#MIN_VALUE} encodes the
     * unknown position (-1).
     */
    private short termPos = Short.MIN_VALUE;

    /**
     * The frequency of term in the document
     */
    private byte termFreq = 1;

    /**
     * Intermediate computation fields
     */
    public String term;
    public String termType;
    public Character sightting;

    /**
     * A term weight assigned based on the type code
     * Weight gets assigned from 0-100
     */
    private byte typeCodeWeight = 0;

    /**
     * Weight contributed by the term frequency. It grows with the
     * frequency and maxes out at 28 once the frequency reaches 10,
     * see {@link #setTermFrequency(byte)}.
     */
    private byte freqWeight = 0;

    /**
     * Creates an empty term with all attributes at their "not set" defaults.
     */
    public Term() {
    }

    /**
     * Creates a term from its stored attributes.
     *
     * @param docPos serial position of the document in the bucket
     * @param docTypeCode document type code
     * @param termTypeCode term type code
     * @param weight term weight
     * @param termPos actual term position; it is encoded with the same rules
     *        as {@link #setTermPos(int)}
     * @param termFreq term frequency; stored as given, the frequency weight
     *        is not derived from it here
     */
    public Term(short docPos, byte docTypeCode, byte termTypeCode,
            byte weight, int termPos, byte termFreq) {
        this.docPos = docPos;
        this.docTypeCode = docTypeCode;
        this.termTypeCode = termTypeCode;
        this.weight = weight;
        this.termPos = encodeTermPos(termPos);
        this.termFreq = termFreq;
    }

    /**
     * Creates a term from its text and resolves the type code and the type
     * weight from the term type name. If {@code term} is empty nothing is
     * initialized and all fields keep their defaults.
     *
     * @param tenant tenant passed to the type code and weight lookups
     * @param term Text term
     * @param sightting Sightting location
     * @param termType Term type; a name longer than 24 characters is logged
     *        and then treated as absent
     * @param termPos actual term position; {@code null} is treated as unknown (-1)
     * @throws ApplicationFault declared for the type code / weight lookups
     * @throws SystemFault declared for the type code / weight lookups
     */
    public Term(String tenant, String term, Character sightting,
            String termType, Integer termPos) throws ApplicationFault, SystemFault {

        if (StringUtils.isEmpty(term)) return;
        this.term = term;

        if (null != termType && termType.length() > 24) {
            IndexLog.l.warn("The Term Type is Too Long:" + termType + "\nTenant=" + tenant);
            termType = null;
        }
        this.termType = termType;

        if (null != termType) {
            this.termTypeCode = TermType.getInstance(true).
                getTypeCode(tenant, termType);
            this.typeCodeWeight = WeightType.getInstance(true).
                getTypeCode(tenant, termType);
        }

        this.termFreq = 1;
        this.freqWeight = FREQ_WEIGHTS[0];
        this.sightting = sightting;
        this.termPos = encodeTermPos(unknownIfNull(termPos));
    }

    /**
     * Creates a term from its text and an already resolved term type code.
     * If {@code term} is empty nothing is initialized and all fields keep
     * their defaults.
     *
     * @param term Text term
     * @param sightting Sightting location
     * @param termTypeCode term type code
     * @param termPos actual term position; {@code null} is treated as unknown (-1)
     */
    public Term(String term, Character sightting,
            byte termTypeCode, Integer termPos) {

        if (StringUtils.isEmpty(term)) return;
        this.term = term;
        this.termTypeCode = termTypeCode;
        this.sightting = sightting;
        this.termPos = encodeTermPos(unknownIfNull(termPos));
    }

    /**
     * Same as {@link #Term(String, Character, byte, Integer)} and additionally
     * sets the document position and the term weight. Both are set even when
     * {@code term} is empty.
     *
     * @param term Text term
     * @param sightting Sightting location
     * @param termTypeCode term type code
     * @param termPos actual term position; {@code null} is treated as unknown (-1)
     * @param docPos serial position of the document in the bucket
     * @param termWeight term weight
     */
    public Term(String term, Character sightting,
            byte termTypeCode, Integer termPos, short docPos, byte termWeight) {

        this(term, sightting, termTypeCode, termPos);
        // Assigned directly so that no overridable method runs during construction.
        this.docPos = docPos;
        this.weight = termWeight;
    }

    /**
     * Prepares this instance for reuse with another term text: stores the
     * given text and puts document position, document type, term type,
     * weight and term position back to their minimum ("not set") values.
     * The frequency, the type and frequency weights, the term type name and
     * the sighting are left untouched.
     *
     * @param term the new term text
     */
    public void resetTerm(String term) {
        this.term = term;
        this.docPos = Short.MIN_VALUE;
        this.docTypeCode = Byte.MIN_VALUE;
        this.termTypeCode = Byte.MIN_VALUE;
        this.weight = Byte.MIN_VALUE;
        this.termPos = Short.MIN_VALUE;
    }

    /**
     * Encodes an actual term position into the lossy 16-bit form. Despite
     * its name this method does not modify the instance, it only returns
     * the encoded value.
     *
     * <ul>
     * <li>Positions -1 (unknown) to 64999 are stored exactly as
     *     {@code position - 32767}, i.e. -32768 to 32232.</li>
     * <li>From 65000 on, the value is 32232 plus one for every full 100000
     *     positions beyond 65000.</li>
     * <li>Positions below -1 are treated as unknown, and positions too large
     *     for the encoding are pinned to {@link Short#MAX_VALUE}, so the
     *     result never wraps around.</li>
     * </ul>
     *
     * @param termPos actual term position
     * @return Lossy approximate term position (short)
     */
    public short setTermPos(int termPos) {
        return encodeTermPos(termPos);
    }

    /**
     * Decodes a value produced by {@link #setTermPos(int)}.
     *
     * @param termPos encoded term position
     * @return the exact position for encoded values up to 32232 (-1 for
     *         {@link Short#MIN_VALUE}); for larger values the first position
     *         of the matching 100000 wide jump bucket
     */
    public int getTermPos(short termPos) {
        if (termPos <= ENCODED_JUMP_BASE) {
            return termPos - Short.MIN_VALUE - 1;
        }
        return EXACT_POSITION_LIMIT + (termPos - ENCODED_JUMP_BASE) * JUMP_SIZE;
    }

    /**
     * Merges another sighting of the same term in the same document into
     * this one. The heavier weight wins and brings its sighting location and
     * (when known) its position along; the heavier type weight brings its
     * term type along; the frequencies are added up, capped at
     * {@link Byte#MAX_VALUE}.
     *
     * @param term the other sighting
     * @return {@code false} when {@code term} is null, belongs to a different
     *         document or has no document position; {@code true} once merged
     */
    public boolean merge(Term term) {
        if (null == term) return false;

        /*
         * Not the term from same document
         */
        if (this.docPos != term.docPos) return false;
        if (Short.MIN_VALUE == term.docPos) return false;

        /*
         * Term repetition in the same document
         */
        if (term.weight > this.weight) {
            this.weight = term.weight;
            this.sightting = term.sightting;
            // termPos holds the encoded form, where "unknown" is Short.MIN_VALUE.
            // A known position must not be replaced by an unknown one.
            if (Short.MIN_VALUE != term.termPos) this.termPos = term.termPos;
        }

        if (term.typeCodeWeight > this.typeCodeWeight) {
            this.termType = term.termType;
            this.typeCodeWeight = term.typeCodeWeight;
        }

        // Added as int so that the sum cannot overflow before it is capped.
        int totalFreq = this.termFreq + term.termFreq;
        setTermFrequency((byte) Math.min(totalFreq, Byte.MAX_VALUE));
        return true;
    }

    /**
     * @return serial position of the document in the bucket
     */
    public short getDocumentPosition() {
        return this.docPos;
    }

    /**
     * @param pos serial position of the document in the bucket
     */
    public void setDocumentPosition(short pos) {
        this.docPos = pos;
    }

    /**
     * @return the document type code
     */
    public byte getDocumentTypeCode() {
        return this.docTypeCode;
    }

    /**
     * @param type the document type code
     */
    public void setDocumentTypeCode(byte type) {
        this.docTypeCode = type;
    }

    /**
     * @return the term type code
     */
    public byte getTermTypeCode() {
        return this.termTypeCode;
    }

    /**
     * @param type the term type code
     */
    public void setTermTypeCode(byte type) {
        this.termTypeCode = type;
    }

    /**
     * @return the term position in its encoded form; decode it with
     *         {@link #getTermPos(short)}
     */
    public short getTermPosition() {
        return this.termPos;
    }

    /**
     * Sets the term position.
     *
     * @param termPos the position in its encoded form, as returned by
     *        {@link #setTermPos(int)}; it is stored as given
     */
    public void setTermPosition(short termPos) {
        this.termPos = termPos;
    }

    /**
     * Sets the encoded term position. The name is misleading: this overload
     * does not touch the weight.
     *
     * @param termPos the position in its encoded form
     * @deprecated use {@link #setTermPosition(short)}; kept only so that
     *             existing callers continue to compile
     */
    @Deprecated
    public void setTermWeight(short termPos) {
        setTermPosition(termPos);
    }

    /**
     * @return the sum of the term weight, the type weight and the frequency
     *         weight, narrowed to a byte.
     *         NOTE(review): the sum may exceed {@link Byte#MAX_VALUE} and then
     *         shows up as a negative byte; confirm how callers interpret it.
     */
    public byte getTermWeight() {
        return (byte) (this.weight + this.typeCodeWeight + this.freqWeight);
    }

    /**
     * @param weight the term weight (without type and frequency weights)
     */
    public void setTermWeight(byte weight) {
        this.weight = weight;
    }

    /**
     * @return the frequency of the term in the document
     */
    public byte getTermFrequency() {
        return this.termFreq;
    }

    /**
     * Sets the term frequency and derives the frequency weight from it:
     * 18 for a frequency of 1, then 20 to 27 for the frequencies 2 to 9 and
     * 28 for everything above.
     *
     * @param termFreq the frequency; values below 1 are stored as 1
     */
    public void setTermFrequency(byte termFreq) {
        this.termFreq = (termFreq < 1) ? 1 : termFreq;

        if (this.termFreq <= FREQ_WEIGHTS.length) {
            this.freqWeight = FREQ_WEIGHTS[this.termFreq - 1];
        } else {
            this.freqWeight = MAX_FREQ_WEIGHT;
        }
    }

    /**
     * @return a readable dump of the term; the weight is shown as
     *         term weight / type weight / frequency weight
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Term :").append(term);
        sb.append(" , Doc Pos :").append(docPos);
        sb.append(" , Doc Type :").append(docTypeCode);
        sb.append(" , Term Pos :").append(termPos);
        if (null != termType) sb.append(" , Term Type :").append(termType);
        sb.append(" , Term Freq :").append(termFreq);
        sb.append(" , Term Weight :").append(weight)
            .append("/").append(typeCodeWeight)
            .append("/").append(freqWeight);
        return sb.toString();
    }

    /**
     * Maps a missing position to the unknown position.
     */
    private static int unknownIfNull(Integer termPos) {
        return (null == termPos) ? UNKNOWN_POSITION : termPos.intValue();
    }

    /**
     * Encodes an actual position into the 16-bit form, pinning out of range
     * input to the nearest representable value instead of wrapping around.
     */
    private static short encodeTermPos(int termPos) {
        int actual = (termPos < UNKNOWN_POSITION) ? UNKNOWN_POSITION : termPos;

        if (actual < EXACT_POSITION_LIMIT) {
            // Result lies within -32768 .. 32232, the cast loses nothing.
            return (short) (Short.MIN_VALUE + actual + 1);
        }

        int jump = (actual - EXACT_POSITION_LIMIT) / JUMP_SIZE;
        return (short) Math.min(ENCODED_JUMP_BASE + jump, Short.MAX_VALUE);
    }
}