package com.bizosys.hsearch.loader; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.ArrayList; import java.util.List; import com.bizosys.hsearch.common.Field; import com.bizosys.hsearch.common.HDocument; import com.bizosys.hsearch.common.SField; import com.bizosys.hsearch.common.Account.AccountInfo; import com.bizosys.hsearch.index.IndexWriter; import com.bizosys.hsearch.loader.csv.CsvWriter; import com.bizosys.oneline.ApplicationFault; import com.bizosys.oneline.SystemFault; import com.bizosys.oneline.pipes.PipeIn; import com.bizosys.oneline.util.StringUtils; public class RowEventProcessorHSearch implements RowEventProcessor { /** * About the User */ private AccountInfo acc = null; /** * About the document */ private String[] headings = null; private String idPrefix= null; private int idFldColumn= -1; private int urlFldColumn= -1; private int weightFldColumn = -1; private int idAutoIncrement = 1; private int[] titleColumns = null; private int keywordColumn = -1; private int[] previewColumns = null; private int[] descFldColumns = null; private int[] indexableColumns = null; private String documentType = null; /** * About Indexing with other meta info */ private HDocument pristineDoc = null; private List<PipeIn> runPlan = null; private boolean isXmlPreview = true; /** * Prebuilt variables */ private String recordStartTag = null; private String recordEndTag = null; private int indexableColsTotal = 0; private int titleColsTotal = 0; private int previewColsTotal = 0; private int descColsTotal = 0; private CsvWriter csvWriter = new CsvWriter(','); private StringBuilder descriptionBuilder = new StringBuilder(512); /** * Indexing Cursor Details */ private int startIndex = 0; private int endIndex = -1; boolean isEndIndex = false; int readDocs = 0; private List<String[]> rows = new ArrayList<String[]>(); private List<HDocument> hdocs = new ArrayList<HDocument>(); private int betchSize = 300; private Writer writer = null; private String lineBreak = null; public RowEventProcessorHSearch( AccountInfo acc, HDocument pristineDoc, List<PipeIn> plan, String idPrefix, int idFldColumn, int urlFldColumn, int weightFldColumn, int[] titleFldColumns, int keywordColumn, int[] previewFldColumns, int[] descFldColumns, String documentType, int[] indexFldColumns, int startIndex, int endIndex, int batchSize, boolean isXmlPreview, Writer writer, String lineBreak) throws ApplicationFault, SystemFault { this.acc = acc; this.pristineDoc = pristineDoc; this.runPlan = ( null == plan) ? IndexWriter.getInstance().getInsertPipes() : plan; this.idPrefix = idPrefix; if ( StringUtils.isEmpty(this.idPrefix)) this.idPrefix = StringUtils.Empty; this.idFldColumn = idFldColumn; this.urlFldColumn = urlFldColumn; this.weightFldColumn = weightFldColumn; this.titleColumns = titleFldColumns; this.titleColsTotal = ( null == titleFldColumns) ? 0 : titleFldColumns.length; this.keywordColumn = keywordColumn; this.previewColumns = previewFldColumns; previewColsTotal = ( null == previewColumns) ? 0 : previewColumns.length; this.descFldColumns = descFldColumns; descColsTotal = ( null == descFldColumns) ? 0 : descFldColumns.length; this.indexableColumns = indexFldColumns; this.indexableColsTotal = ( null == indexFldColumns ) ? 0 : indexFldColumns.length; if ( LoaderLog.l.isDebugEnabled() ) LoaderLog.l.debug( "Total Indexable Columns: " + this.indexableColsTotal); this.documentType = documentType; this.recordStartTag = "<" + this.documentType + ">"; this.recordEndTag= "</" + this.documentType + ">"; if ( endIndex != -1) { if ( startIndex >= endIndex ) throw new ApplicationFault( "Not allowed as reading ends at " + endIndex); } this.startIndex = startIndex; if ( endIndex != -1 && endIndex <= startIndex ) throw new ApplicationFault("Not allowed as reading starts from " + startIndex); this.endIndex = endIndex; if ( this.endIndex > 0 ) isEndIndex = true; this.betchSize = batchSize; this.isXmlPreview = isXmlPreview; this.writer = writer; if ( null == writer) { try { this.writer = new OutputStreamWriter(System.out, "UTF-8"); } catch (UnsupportedEncodingException ex) { //This will never happen LoaderLog.l.warn( "OutputStream encoding issues", ex); } } this.lineBreak = lineBreak; } public void onHeaderRow(String[] cells) throws ApplicationFault, SystemFault { int totalCells = ( null == cells ) ? 0 : cells.length; if ( 0 == totalCells) throw new ApplicationFault("There is no header row"); for (String cell : cells) { if ( cell.length() > 24 ) { throw new ApplicationFault("In Appropriate Header.. More than 16 character.\n" + StringUtils.arrayToString(cells, '|')); } } this.readDocs++; this.headings = cells; } public void onDataRow(String[] cells) throws ApplicationFault, SystemFault { this.readDocs++; if ( this.readDocs < this.startIndex) return; //Not reached yet if ( isEndIndex ) if ( this.readDocs > this.endIndex) return; // Already done this.rows.add(cells); if ( this.rows.size() >= this.betchSize) { insert(this.rows, this.runPlan); this.rows.clear(); } } public void onEnd() throws ApplicationFault, SystemFault { //Flush the rest if ( this.rows.size() > 0 ) { insert(this.rows, this.runPlan); this.rows.clear(); } } /** * Insert * @param records * @param runPlan * @return * @throws Exception */ private void insert(List<String[]> rows, List<PipeIn> runPlan ) throws ApplicationFault, SystemFault { StringBuilder text = new StringBuilder(1024); StringBuilder title = new StringBuilder(); StringBuilder preview = new StringBuilder(1024); hdocs.clear(); for (String[] cells : rows) { text.delete(0, text.capacity()); title.delete(0, title.capacity()); preview.delete(0, preview.capacity()); HDocument aDoc = new HDocument(); aDoc.docType = this.pristineDoc.docType; aDoc.url = this.pristineDoc.url; aDoc.eastering = this.pristineDoc.eastering; aDoc.team = this.pristineDoc.team; aDoc.editPermission = this.pristineDoc.editPermission; aDoc.ipAddress = this.pristineDoc.ipAddress; aDoc.locale = this.pristineDoc.locale; aDoc.northing = this.pristineDoc.northing; aDoc.securityHigh= this.pristineDoc.securityHigh; aDoc.sentimentPositive = this.pristineDoc.sentimentPositive; aDoc.viewPermission = this.pristineDoc.viewPermission; if ( -1 == this.idFldColumn) { if ( null == this.idPrefix) aDoc.key = new Integer(this.idAutoIncrement++).toString(); else aDoc.key = this.idPrefix + (this.idAutoIncrement++); } else { if ( StringUtils.isEmpty(cells[this.idFldColumn]) ) { String msg = "\nEmpty Id.. Skipping > " + StringUtils.arrayToString(cells, '|'); try { this.writer.write(msg); } catch (IOException ex) { throw new ApplicationFault(ex); } continue; } aDoc.key = this.idPrefix + cells[this.idFldColumn]; } if ( -1 != this.urlFldColumn) { if ( ! StringUtils.isEmpty(cells[this.urlFldColumn]) ) { aDoc.url = cells[this.urlFldColumn]; } } if ( ! StringUtils.isEmpty(cells[this.weightFldColumn]) ) { aDoc.weight = new Integer(cells[this.weightFldColumn]).intValue(); } if ( this.keywordColumn > -1) { if ( this.keywordColumn >= cells.length ) { throw new ApplicationFault( "Keyword column does not exist : " + cells.length + "/" + this.keywordColumn); } String keywords = cells[this.keywordColumn]; aDoc.tags = tagSplit(keywords, " , "); if ( LoaderLog.l.isDebugEnabled()) { LoaderLog.l.debug("Keyword:" + aDoc.tags.toString()); } } if ( indexableColsTotal > 0) { aDoc.fields = new ArrayList<Field>(indexableColsTotal); for (int columnNumber : indexableColumns) { if ( StringUtils.isEmpty(cells[columnNumber]) ) continue; aDoc.fields.add(new SField(headings[columnNumber], cells[columnNumber]) ); } } if ( titleColsTotal > 0) { if ( titleColsTotal > 1 ) aDoc.title = buildTitle(title, cells); else aDoc.title = cells[this.titleColumns[0]]; } if ( previewColsTotal > 0) { if ( previewColsTotal > 1 ) aDoc.preview = this.buildPreview(preview, cells); else aDoc.preview = cells[this.previewColumns[0]]; if ( null != this.lineBreak) { if ( aDoc.preview.indexOf(this.lineBreak) >= 0) { aDoc.preview = aDoc.preview.replace(this.lineBreak, "\n"); } } } if ( descColsTotal > 0) { if ( descColsTotal > 1 ) aDoc.cacheText = this.buildDescription(cells); else aDoc.cacheText = cells[this.descFldColumns[0]]; if ( null != this.lineBreak) { if ( aDoc.cacheText.indexOf(this.lineBreak) >= 0) { aDoc.cacheText = aDoc.cacheText.replace(this.lineBreak, "\n"); } } } hdocs.add(aDoc); } long s = System.currentTimeMillis(); try { if ( LoaderLog.l.isInfoEnabled() ) { LoaderLog.l.info("\nInserting> End Row Id :" + readDocs + ", Batch Size:" + hdocs.size()); } this.writer.flush(); IndexWriter.getInstance().insertBatch(hdocs, this.acc, runPlan, true); if ( LoaderLog.l.isInfoEnabled() ) { long e = System.currentTimeMillis(); LoaderLog.l.info("\nInsered> " + (e -s) + "ms"); } } catch (Exception ex) { String msg = "Failed where> End Row Id :" + readDocs + ", Batch Size:" + hdocs.size(); try { this.writer.write(msg); } catch (Exception iex){}; LoaderLog.l.fatal(ex); throw new ApplicationFault(ex); } } /** * Populate the title * @param title * @param cols * @return */ private String buildTitle(StringBuilder title, String[] cells) { String colVal = null; if ( 1 == this.titleColsTotal) { return cells[this.titleColumns[0]]; } for (int aTitleCol : this.titleColumns) { colVal = cells[aTitleCol]; if ( StringUtils.isEmpty(colVal) ) continue; title.append(colVal); title.append(", "); } return title.toString().trim(); } /** * Build the preview. * @param xml * @param cols * @return */ private String buildPreview( StringBuilder preview, String[] cells) { String cellVal = null; String colName = null; if ( isXmlPreview ) { preview.append(this.recordStartTag); for (int colIndex : this.previewColumns) { cellVal = cells[colIndex]; if ( StringUtils.isEmpty( cellVal) ) continue; colName = headings[colIndex]; preview.append('<').append(colName).append('>'); preview.append(cells[colIndex]); preview.append("</").append(colName).append('>'); } preview.append(this.recordEndTag); return preview.toString(); } else { return csvWriter.writeRow(cells, preview); } } /** * Build the description. * @param xml * @param cols * @return */ private String buildDescription( String[] cells) { String cellVal = null; for (int colIndex : this.descFldColumns) { cellVal = cells[colIndex]; if ( StringUtils.isEmpty( cellVal) ) continue; descriptionBuilder.append(cellVal).append(' '); } String res = descriptionBuilder.toString(); descriptionBuilder.delete(0, descriptionBuilder.capacity()); return res; } private static List<String> tagSplit(final String text, String separator) { if ( null == text)return null; if (text.length() == 0 ) return null; int separatorLen = separator.length(); final List<String> result = new ArrayList<String>(); int index1 = 0; int index2 = text.indexOf(separator); String token = null; while (index2 >= 0) { token = text.substring(index1, index2); result.add(token); index1 = index2 + separatorLen; index2 = text.indexOf(separator, index1); } if (index1 < text.length() - 1) { result.add(text.substring(index1)); } return result; } }