/* * Copyright 2010 Bizosys Technologies Limited * * Licensed to the Bizosys Technologies Limited (Bizosys) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The Bizosys licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.bizosys.hsearch.benchmark; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import org.apache.log4j.Logger; import com.bizosys.hsearch.common.Field; import com.bizosys.hsearch.common.HDocument; import com.bizosys.hsearch.common.HField; import com.bizosys.hsearch.util.FileReaderUtil; import com.bizosys.oneline.ApplicationFault; import com.bizosys.oneline.SystemFault; import com.bizosys.oneline.util.StringUtils; public class LuceneTabFileCrawler { private static Logger l = Logger.getLogger(LuceneTabFileCrawler.class.getName()); private int betchSize = 300; private String[] fields = null; private String[] startTag = null; private String[] endTag = null; private String recordStartTag = null; private String recordEndTag = null; private int startIndex = 0; private int endIndex = -1; private int totalFields = 0; private int totalFieldsMinue1 = 0; private File aFile = null; private int idIndex = -1; private boolean isTitle = false; private int[] titleIndex = null; private int titleIndexT = 0; private String idFldName = null; private String[] titleFldNames = null; private boolean isPreview = false; private int[] previewIndex = null; private int previewIndexT = 0; private String[] previewFldNames = null; private HDocument pristineDoc = null; private boolean isEndIndex = false; int readDocs = 0; long bucket = 0; private LuceneTabFileCrawler() { } public LuceneTabFileCrawler(String filePath, HDocument doc, String idFldName, String[] titleFldNames, String[] previewFields, int startIndex, int endIndex, int batchSize) throws ApplicationFault, SystemFault { this.pristineDoc = doc; this.aFile = FileReaderUtil.getFile(filePath); if ( !aFile.exists() ) { throw new ApplicationFault( "TabFileFetcher > File does not exist, " + this.aFile.getAbsolutePath()); } if ( !aFile.canRead() ) { throw new ApplicationFault( "TabFileFetcher > Can't read the file" + this.aFile.getAbsolutePath()); } String fileName = aFile.getName(); int dotAt = fileName.lastIndexOf('.'); String docName = ( dotAt == -1) ? fileName : fileName.substring(0, dotAt); this.recordStartTag = "<" + docName + ">"; this.recordEndTag= "</" + docName + ">"; this.idFldName = idFldName; this.titleFldNames = titleFldNames; this.previewFldNames = previewFields; if ( endIndex != -1) { if ( startIndex >= endIndex ) throw new ApplicationFault( "Not allowed as reading ends at " + endIndex); } this.startIndex = startIndex; if ( endIndex != -1 && endIndex <= startIndex ) throw new ApplicationFault("Not allowed as reading starts from " + startIndex); this.endIndex = endIndex; if ( this.endIndex > 0 ) isEndIndex = true; this.betchSize = batchSize; } /** * * @param runPlan * @throws ApplicationFault * @throws SystemFault */ public void fetchAndIndex() throws ApplicationFault, SystemFault { l.debug("TabFileFetcher > Loading " + this.aFile.getName()); readDocs = -1; BufferedReader reader = null; InputStream stream = null; String[] words = null; String line = null; try { stream = new FileInputStream(aFile); reader = new BufferedReader ( new InputStreamReader (stream) ); boolean isFirstLine = true; int counter = 0; List<String[]> records = new ArrayList<String[]>(50); while((line = reader.readLine()) !=null ) { if ( isFirstLine) { initialize(line); isFirstLine = false; continue; } if (line.length() == 0) continue; char first=line.charAt(0); switch (first) { case ' ' : case '\n' : case '#' : // skip blank & comment lines continue; } counter = 0; int index1 = 0; int index2 = line.indexOf('\t'); String token = null; words = new String[this.totalFields]; while (index2 >= 0) { token = line.substring(index1, index2); if ( StringUtils.isEmpty(token)) words[counter] = StringUtils.Empty; else words[counter] = token; counter++; index1 = index2 + 1; if ( index1 > line.length() - 1) break; index2 = line.indexOf('\t', index1); } if (index1 < line.length() - 1) words[counter] = line.substring(index1); if ( words[this.totalFieldsMinue1] == null ) words[this.totalFieldsMinue1] = StringUtils.Empty; records.add(words); int recordsT = records.size(); if ( recordsT >= betchSize ) { l.debug("\n Total records written" + readDocs); if ( ! insert(records) ) return; records.clear(); } } this.insert(records); records.clear(); } catch (Exception ex) { String msg = ""; if ( null != line ) msg = line + "\n"; if ( null != words) msg = msg + words.toString(); l.fatal("TabFileFetcher > " + msg, ex); throw new ApplicationFault(ex); } finally { try {if ( null != reader ) reader.close(); } catch (Exception ex) {l.warn("TabFileFetcher", ex);} try {if ( null != stream ) stream.close(); } catch (Exception ex) {l.warn("TabFileFetcher", ex);} } return; } private void initialize(String line) throws ApplicationFault { this.totalFieldsMinue1 = StringUtils.totalSighings(line, '\t'); this.totalFields = this.totalFieldsMinue1 + 1; this.fields = StringUtils.getStrings(line, "\t"); this.startTag = new String[this.totalFields]; this.endTag = new String[this.totalFields]; for ( int i=0; i< this.totalFieldsMinue1; i++ ) { this.startTag[i] = "<" + fields[i] + ">"; this.endTag[i] = "</" + fields[i] + ">"; } this.isTitle = null != this.titleFldNames; int titleCounter = 0; String commaSepTitles = null; if ( isTitle ) { this.titleIndex = new int[this.titleFldNames.length]; commaSepTitles = StringUtils.arrayToString(this.titleFldNames); } this.isPreview = null != this.previewFldNames; int previewCounter = 0; String commaSepPreviews = null; if ( isPreview ) { this.previewIndex = new int[this.previewFldNames.length]; commaSepPreviews = StringUtils.arrayToString(this.previewFldNames); } int counter = -1; for (String fld : this.fields) { counter++; //Id if ( this.idFldName.equals(fld) ) this.idIndex = counter; //Title if ( isTitle ) { if ( commaSepTitles.indexOf(fld) > -1 ) this.titleIndex[titleCounter++] = counter; } //Preview if ( isPreview ) { if (previewCounter < this.previewIndex.length && commaSepPreviews.indexOf(fld) > -1 ) this.previewIndex[previewCounter++] = counter; } } if ( this.idIndex == -1) throw new ApplicationFault("ID field is not known."); if ( isTitle ) this.titleIndexT = this.titleIndex.length; if ( isPreview ) this.previewIndexT = this.previewIndex.length; } /** * Insert * @param records * @param runPlan * @return * @throws Exception */ private boolean insert(List<String[]> records) throws Exception { bucket++; boolean indexNext = true; StringBuilder text = new StringBuilder(1024); StringBuilder title = null; if ( isTitle ) title = new StringBuilder(); StringBuilder xml = null; if ( isPreview ) xml = new StringBuilder(1024); List<HDocument> hdocs = new ArrayList<HDocument>(records.size()); for (String[] cols : records) { /** * Boundary check.. Start and end points */ readDocs++; if ( readDocs < this.startIndex) continue; if ( isEndIndex ) { if ( readDocs > this.endIndex) { indexNext = false; break; } } /** * Refresh and reuse the containers */ text.delete(0, text.capacity()); if ( isTitle ) title.delete(0, title.capacity()); if ( isPreview ) xml.delete(0, xml.capacity()); HDocument aDoc = new HDocument(); aDoc.docType = this.pristineDoc.docType; aDoc.url = this.pristineDoc.url; aDoc.eastering = this.pristineDoc.eastering; aDoc.team = this.pristineDoc.team; aDoc.editPermission = this.pristineDoc.editPermission; aDoc.ipAddress = this.pristineDoc.ipAddress; aDoc.locale = this.pristineDoc.locale; aDoc.northing = this.pristineDoc.northing; aDoc.securityHigh= this.pristineDoc.securityHigh; aDoc.sentimentPositive = this.pristineDoc.sentimentPositive; aDoc.viewPermission = this.pristineDoc.viewPermission; aDoc.key = cols[this.idIndex]; aDoc.fields = new ArrayList<Field>(this.totalFieldsMinue1); for ( int i=0; i< this.totalFieldsMinue1; i++ ) { if ( this.idIndex == i) continue; if ( StringUtils.isEmpty(cols[i]) ) continue; String colVal = StringUtils.encodeXml(cols[i]).toLowerCase(); if ( StringUtils.isEmpty(colVal) ) continue; aDoc.fields.add(new HField(this.fields[i], colVal)); if (l.isDebugEnabled()) l.debug( "Field:" + this.fields[i] + "-----" + colVal); text.append(colVal).append(' '); } aDoc.cacheText = text.toString().trim(); if ( isTitle) aDoc.title = buildTitle(title, cols); if ( isPreview ) aDoc.preview = this.buildPreview(xml, cols); hdocs.add(aDoc); } long s = System.currentTimeMillis(); System.out.println("Inserting :" + records.size() + ", done:" + readDocs); LuceneIndexManager.getInstance().insert(hdocs); long e = System.currentTimeMillis(); System.out.println("Total time taken :" + records.size() + " in ms " + (e -s)); return indexNext; } /** * Populate the title * @param title * @param cols * @return */ private String buildTitle(StringBuilder title, String[] cols) { String colVal = null; if ( 1 == this.titleIndexT) { return cols[this.titleIndex[0]]; } for (int titleI=0; titleI<titleIndexT; titleI++ ) { colVal = cols[this.titleIndex[titleI]]; if ( StringUtils.isEmpty(colVal) ) continue; title.append(colVal); title.append(' '); } return title.toString().trim(); } /** * Build the preview. * @param xml * @param cols * @return */ private String buildPreview( StringBuilder xml, String[] cols) { String colVal = null; xml.append(this.recordStartTag); int colIndex = 0; if ( 1 == this.previewIndexT) { colIndex = this.previewIndex[0]; if ( StringUtils.isEmpty(cols[colIndex])) return null; colVal = StringUtils.encodeXml(cols[colIndex]); if ( ! StringUtils.isEmpty(colVal) ) { xml.append(this.startTag[colIndex]); xml.append(colVal); xml.append(this.endTag[colIndex]); } } else { for (int previewI=0; previewI<previewIndexT; previewI++ ) { colIndex = this.previewIndex[previewI]; if ( StringUtils.isEmpty( cols[colIndex]) ) continue; colVal = StringUtils.encodeXml(cols[colIndex]); if ( StringUtils.isEmpty(colVal) ) continue; xml.append(this.startTag[colIndex]); xml.append(colVal); xml.append(this.endTag[colIndex]); } } xml.append(this.recordEndTag); return xml.toString(); } }