/* * Copyright 2010 Bizosys Technologies Limited * * Licensed to the Bizosys Technologies Limited (Bizosys) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Bizosys licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bizosys.hsearch.inpipe; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; import com.bizosys.hsearch.common.ByteField; import com.bizosys.hsearch.filter.Storable; import com.bizosys.hsearch.index.Doc; import com.bizosys.hsearch.index.DocContent; import com.bizosys.hsearch.index.DocMeta; import com.bizosys.hsearch.index.DocTeaser; import com.bizosys.hsearch.index.DocTerms; import com.bizosys.hsearch.index.IndexLog; import com.bizosys.hsearch.index.Term; import com.bizosys.hsearch.index.TermType; import com.bizosys.hsearch.inpipe.util.ReaderType; import com.bizosys.hsearch.lang.Stemmer; import com.bizosys.hsearch.util.DataConstants; import com.bizosys.hsearch.util.ObjectFactory; import com.bizosys.oneline.ApplicationFault; import com.bizosys.oneline.SystemFault; import com.bizosys.oneline.util.StringUtils; /** * This is an abstract class which reads the various dimensions of * the document and tokenizes them including ID, URL, Fields, Title. * @author karan * */ public abstract class TokenizeBase { private static final boolean DEBUG_ENABLED = IndexLog.l.isDebugEnabled(); /** * Pack different sections with different readers. * This potentially helps on weight assignment. * @param aDoc A document * @return Reader types */ protected List<ReaderType> getReaders(Doc aDoc) throws SystemFault, ApplicationFault { List<ReaderType> readers = new ArrayList<ReaderType>(); DocTeaser teaser = aDoc.teaser; DocContent content = aDoc.content; DocMeta meta = aDoc.meta; if ( null == aDoc.terms) aDoc.terms = new DocTerms(); DocTerms terms = aDoc.terms; if ( null != content) { //The content fields if ( null != content.analyzedIndexed) addReader(aDoc.tenant, content.analyzedIndexed, terms, readers, Term.TERMLOC_XML,true); if ( null != content.nonAnalyzedIndexed) addReader(aDoc.tenant, content.nonAnalyzedIndexed, terms, readers, Term.TERMLOC_XML,false); } /** * Add the non analyzed ID field */ if ( null != teaser.id) { addReader(aDoc.tenant, new ByteField(TermType.URL_OR_ID, teaser.id), terms, readers, Term.TERMLOC_URL, false); } /** * Adding the URL first */ String url = teaser.getUrl(); if ( ! StringUtils.isEmpty(url) ) { if ( url.startsWith("http") || url.startsWith("file") ) { url = StringUtils.replaceMultipleCharsToAnotherChar( url, DataConstants.URL_SEPARATOR, ' '); StringTokenizer tokenizer = new StringTokenizer (url," "); List<ByteField> fields = new ArrayList<ByteField>(); while (tokenizer.hasMoreTokens()) { String urlToken = tokenizer.nextToken(); if ( urlToken.length() < 2) continue; fields.add(new ByteField(TermType.URL_OR_ID, urlToken)); } addReader(aDoc.tenant, fields, terms, readers, Term.TERMLOC_URL, true); } } /** * Adding the title */ if ( null != teaser.getTitle()) { ByteField title = new ByteField(TermType.TITLE, teaser.getTitle()); addReader(aDoc.tenant, title,terms,readers,Term.TERMLOC_SUBJECT, true); } /** * Adding the cached text */ if ( null != teaser.getCachedText()) { ByteField cached = new ByteField(TermType.BODY, teaser.getCachedText()); addReader(aDoc.tenant, cached,terms,readers,Term.TERMLOC_BODY, true); } /** * Adding the Keywords */ if ( null != meta.tags || null != meta.socialText) { List<ByteField> keywords = ObjectFactory.getInstance().getFieldList(); if ( null != meta.tags ){ for (String keyword : meta.getTags()) { keywords.add(new ByteField(TermType.KEYWORD, keyword)); } addReader(aDoc.tenant, keywords,terms,readers,Term.TERMLOC_BODY, false); } keywords.clear(); if ( null != meta.socialText ){ for (String keyword : meta.getSocialText()) { keywords.add(new ByteField(TermType.KEYWORD, keyword)); } addReader(aDoc.tenant, keywords,terms,readers,Term.TERMLOC_KEYWORD, false); } ObjectFactory.getInstance().putFieldList(keywords); } return readers; } private void addReader(String tenant, List<ByteField> fields, DocTerms terms, List<ReaderType> readers, Character termLoc, boolean analyze) throws SystemFault, ApplicationFault { for (ByteField fld: fields) { addReader(tenant, fld, terms, readers, termLoc, analyze); } } private void addReader(String tenant, ByteField fld, DocTerms terms, List<ReaderType> readers, Character termLoc, boolean analyze) throws SystemFault, ApplicationFault { String text = null; if (fld.type == Storable.BYTE_STRING) { Object objStr = fld.getValue(); if ( null == objStr) return; text = (String) objStr; } else if (fld.type == Storable.BYTE_STORABLE) { text = fld.getValue().toString(); } if ( null == text) throw new SystemFault("TokenizerBase: Unknow data type :" + fld.toString()); text = text.toLowerCase(); text = StringUtils.replaceMultipleCharsToAnotherChar( text, DataConstants.FIELDVAL_SEPARATOR, ' '); boolean oneWord = text.indexOf(' ') < 0 ; if (oneWord || !analyze) { text = Stemmer.getInstance().stem(text); Term term = new Term(tenant, text,termLoc,fld.name,0); if ( DEBUG_ENABLED) IndexLog.l.debug("Term added > " + term.toString()); terms.getTermList().add(term); } else { InputStream ba = new ByteArrayInputStream( fld.toBytes()); InputStreamReader is = new InputStreamReader(ba); if ( DEBUG_ENABLED) IndexLog.l.debug("Reader added >" + fld.name); readers.add(new ReaderType(termLoc,fld.name,is)); } } }