TokenizeBase.java example

Explorer
hsearch-obsolete-master
- src
/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch.inpipe;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import com.bizosys.hsearch.common.ByteField;
import com.bizosys.hsearch.filter.Storable;
import com.bizosys.hsearch.index.Doc;
import com.bizosys.hsearch.index.DocContent;
import com.bizosys.hsearch.index.DocMeta;
import com.bizosys.hsearch.index.DocTeaser;
import com.bizosys.hsearch.index.DocTerms;
import com.bizosys.hsearch.index.IndexLog;
import com.bizosys.hsearch.index.Term;
import com.bizosys.hsearch.index.TermType;
import com.bizosys.hsearch.inpipe.util.ReaderType;
import com.bizosys.hsearch.lang.Stemmer;
import com.bizosys.hsearch.util.DataConstants;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.util.StringUtils;

/**
 * Abstract base for tokenizing pipes. It walks the different dimensions of a
 * document (content fields, ID, URL, title, cached text, tags and social
 * text) and, for each value, either registers a ready-made {@link Term} on
 * the document or hands back a {@link ReaderType} for later analysis.
 * @author karan
 *
 */
public abstract class TokenizeBase {
	
	private static final boolean DEBUG_ENABLED = IndexLog.l.isDebugEnabled();

	/**
	 * Pack different sections with different readers.
	 * This potentially helps on weight assignment.
	 * <p>
	 * Sections are processed in this order: content fields, ID, URL, title,
	 * cached text, tags, social text. Values that are a single word, or that
	 * belong to a non-analyzed section, are stemmed and appended directly to
	 * <code>aDoc.terms</code> (which is created when absent); only multi-word
	 * values of analyzed sections produce a reader. Sections whose holder
	 * (<code>content</code>, <code>teaser</code>, <code>meta</code>) is
	 * <code>null</code> are skipped.
	 * @param aDoc	A document
	 * @return	Reader types, possibly empty, never <code>null</code>
	 * @throws SystemFault	When a field carries a data type that cannot be turned into text
	 * @throws ApplicationFault	Declared for subclasses/callers; not raised directly here
	 */
	protected List<ReaderType> getReaders(Doc aDoc) 
	throws SystemFault, ApplicationFault {
		
		List<ReaderType> readers = new ArrayList<ReaderType>();
		if ( null == aDoc.terms) aDoc.terms = new DocTerms(); 
		DocTerms terms = aDoc.terms;
		
		if ( null != aDoc.content) 
			addContentReaders(aDoc.tenant, aDoc.content, terms, readers);
		
		// A document may come without teaser or meta; skip those sections
		// instead of failing with a NullPointerException.
		if ( null != aDoc.teaser) 
			addTeaserReaders(aDoc.tenant, aDoc.teaser, terms, readers);
		
		if ( null != aDoc.meta) 
			addKeywordReaders(aDoc.tenant, aDoc.meta, terms, readers);
		
		return readers;
	}

	/**
	 * Adds the indexed content fields, analyzed ones first.
	 */
	private void addContentReaders(String tenant, DocContent content, 
		DocTerms terms, List<ReaderType> readers) 
	throws SystemFault, ApplicationFault {
		
		if ( null != content.analyzedIndexed) 
			addReader(tenant, content.analyzedIndexed, terms,  readers, Term.TERMLOC_XML,true);
		if ( null != content.nonAnalyzedIndexed) 
			addReader(tenant, content.nonAnalyzedIndexed, terms,  readers, Term.TERMLOC_XML,false);
	}

	/**
	 * Adds the teaser dimensions: ID, URL tokens, title and cached text.
	 */
	private void addTeaserReaders(String tenant, DocTeaser teaser, 
		DocTerms terms, List<ReaderType> readers) 
	throws SystemFault, ApplicationFault {
		
		// The ID is indexed as-is (not analyzed).
		if ( null != teaser.id) {
			addReader(tenant, new ByteField(TermType.URL_OR_ID, teaser.id), terms, readers, Term.TERMLOC_URL, false);
		}
		
		addUrlReaders(tenant, teaser.getUrl(), terms, readers);

		// The title
		if ( null != teaser.getTitle()) {
			ByteField title = new ByteField(TermType.TITLE, teaser.getTitle());
			addReader(tenant, title,terms,readers,Term.TERMLOC_SUBJECT, true);
		}
		
		// The cached text
		if ( null != teaser.getCachedText()) {
			ByteField cached = new ByteField(TermType.BODY, teaser.getCachedText());
			addReader(tenant, cached,terms,readers,Term.TERMLOC_BODY, true);
		}
	}

	/**
	 * Splits a URL on {@link DataConstants#URL_SEPARATOR} and adds every
	 * token of at least two characters. Only URLs starting with "http" or
	 * "file" are considered; anything else is ignored.
	 */
	private void addUrlReaders(String tenant, String url, 
		DocTerms terms, List<ReaderType> readers) 
	throws SystemFault, ApplicationFault {
		
		if ( StringUtils.isEmpty(url) ) return;
		if ( ! ( url.startsWith("http") || url.startsWith("file") ) ) return;
		
		String spaced = StringUtils.replaceMultipleCharsToAnotherChar(
			url, DataConstants.URL_SEPARATOR, ' ');
		StringTokenizer tokenizer = new StringTokenizer (spaced," ");
		List<ByteField> fields = new ArrayList<ByteField>(); 
		while (tokenizer.hasMoreTokens()) {
			String urlToken = tokenizer.nextToken();
			if ( urlToken.length() < 2) continue; // single characters are dropped
			fields.add(new ByteField(TermType.URL_OR_ID, urlToken));
		}
		addReader(tenant, fields, terms,  readers, Term.TERMLOC_URL, true);
	}

	/**
	 * Adds the tags and the social text as non-analyzed keywords. The working
	 * list is borrowed from {@link ObjectFactory} and is always handed back,
	 * even when adding a keyword raises a fault.
	 */
	private void addKeywordReaders(String tenant, DocMeta meta, 
		DocTerms terms, List<ReaderType> readers) 
	throws SystemFault, ApplicationFault {
		
		if ( null == meta.tags && null == meta.socialText) return;
		
		List<ByteField> keywords = ObjectFactory.getInstance().getFieldList();
		try {
			if ( null != meta.tags ){
				for (String keyword : meta.getTags()) {
					keywords.add(new ByteField(TermType.KEYWORD, keyword));
				}
				// NOTE(review): tags are registered under TERMLOC_BODY while the
				// social text below uses TERMLOC_KEYWORD - confirm this is intended.
				addReader(tenant, keywords,terms,readers,Term.TERMLOC_BODY, false);
			}
			
			// Reuse the same list for the social text.
			keywords.clear();
			
			if ( null != meta.socialText ){
				for (String keyword : meta.getSocialText()) {
					keywords.add(new ByteField(TermType.KEYWORD, keyword));
				}
				addReader(tenant, keywords,terms,readers,Term.TERMLOC_KEYWORD, false);				
			}
		} finally {
			ObjectFactory.getInstance().putFieldList(keywords);
		}
	}

	/**
	 * Adds every field of the list, in order, with the same location and
	 * analysis setting.
	 */
	private void addReader(String tenant, List<ByteField> fields, DocTerms terms,  
		List<ReaderType> readers, Character termLoc, boolean analyze) 
	throws SystemFault, ApplicationFault  {
		
		for (ByteField fld: fields) {
			addReader(tenant, fld, terms, readers, termLoc, analyze);
		}
	}
	
	/**
	 * Turns one field into either a term or a reader.
	 * <p>
	 * A field without a value is skipped. When the value is a single word, or
	 * when <code>analyze</code> is false, the lower-cased, separator-free and
	 * stemmed text is appended to <code>terms</code>. Otherwise a reader over
	 * the field bytes is appended to <code>readers</code>.
	 * @param tenant	Tenant the term belongs to
	 * @param fld	The field to process
	 * @param terms	Receives directly created terms
	 * @param readers	Receives readers for values needing analysis
	 * @param termLoc	Term location code (one of the Term.TERMLOC_* values)
	 * @param analyze	Whether multi-word values should be left to the analyzer
	 * @throws SystemFault	When the field type is neither string nor storable,
	 * 	or its value has no text form
	 */
	private void addReader(String tenant, ByteField fld, DocTerms terms,
		List<ReaderType> readers, Character termLoc, boolean analyze)
		throws SystemFault, ApplicationFault {
		
		String text = null;
		if (fld.type == Storable.BYTE_STRING) {
			Object objStr = fld.getValue();
			if ( null == objStr) return;
			text = (String) objStr;
		} else if (fld.type == Storable.BYTE_STORABLE) {
			// Same treatment as strings: a missing value is skipped, not a crash.
			Object objVal = fld.getValue();
			if ( null == objVal) return;
			text = objVal.toString();
		}

		if ( null == text) throw 
			new SystemFault("TokenizerBase: Unknow data type :" + fld.toString());
		
		// NOTE(review): toLowerCase() uses the JVM default locale; kept as-is
		// because query-side normalization (not visible here) must match.
		text = text.toLowerCase();
		text = StringUtils.replaceMultipleCharsToAnotherChar(
				text, DataConstants.FIELDVAL_SEPARATOR, ' ');		
		boolean oneWord = text.indexOf(' ') < 0 ;
		if (oneWord || !analyze) {
			text = Stemmer.getInstance().stem(text);
			Term term = new Term(tenant, text,termLoc,fld.name,0);
			if ( DEBUG_ENABLED) IndexLog.l.debug("Term added > " + term.toString());
			terms.getTermList().add(term);
		} else { 
			// The reader streams the field's own bytes, not the normalized text
			// computed above.
			// NOTE(review): no charset is given, so decoding uses the platform
			// default - confirm it matches what ByteField.toBytes() produces.
			InputStream ba = new ByteArrayInputStream( fld.toBytes());
			InputStreamReader is = new InputStreamReader(ba);
			if ( DEBUG_ENABLED) IndexLog.l.debug("Reader added >" + fld.name);
			readers.add(new ReaderType(termLoc,fld.name,is));
		} 							
	}
}