XmlFileCrawler.java example

Explorer
hsearch-obsolete-master
- src
/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.Logger;

import com.bizosys.hsearch.common.Account;
import com.bizosys.hsearch.common.Field;
import com.bizosys.hsearch.common.HDocument;
import com.bizosys.hsearch.common.HField;
import com.bizosys.hsearch.common.Account.AccountInfo;
import com.bizosys.hsearch.index.IndexWriter;
import com.bizosys.hsearch.util.FileReaderUtil;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.hsearch.util.XmlRecordExtracter;
import com.bizosys.hsearch.util.XmlRecordExtracterCallback;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.pipes.PipeIn;
import com.bizosys.oneline.util.StringUtils;

/**
 * Reads an XML file record by record and pushes the records to the index
 * in batches.
 *
 * <p>The file is parsed with a SAX parser driven by {@link XmlRecordExtracter};
 * this class is the extractor's callback. Each extracted record (a map of
 * field name to value) is buffered, and once {@code batchSize} records are
 * buffered they are converted to {@link HDocument}s and handed to
 * {@link IndexWriter#insertBatch}. The last, possibly partial, batch is
 * flushed when parsing finishes.</p>
 *
 * <p>Instances keep mutable per-run state (record buffer, counters) and also
 * update the static counters {@link #id} and {@link #batch} without any
 * synchronization.</p>
 */
public class XmlFileCrawler implements XmlRecordExtracterCallback {
	
	private static final Logger l = Logger.getLogger(XmlFileCrawler.class.getName());
	
	/** Number of buffered records that triggers a batch insert. */
	private int batchSize = 300;

	/** Preview field name to its opening tag, e.g. {@code name -> <name>}. Built by initialize(). */
	public Map<String, String> startTag = null;
	/** Preview field name to its closing tag, e.g. {@code name -> </name>}. Built by initialize(). */
	public Map<String,String> endTag = null;
	/** Opening tag wrapped around every preview; derived from the file's base name. */
	public String recordStartTag = null;
	/** Closing tag wrapped around every preview; derived from the file's base name. */
	public String recordEndTag = null;

	/** Zero-based index of the first ID-bearing record to index. */
	public int startIndex = 0;
	/** Zero-based index of the last ID-bearing record to index (inclusive); -1 means no limit. */
	public int endIndex = -1;
	/** True when {@link #endIndex} is positive and therefore enforced. */
	public boolean isEndIndex = false;

	/** Base name of the input file (file name without its last extension). */
	public String docName = null;

	/** The XML file being crawled. */
	public File aFile = null;
	/** True when title fields were supplied. */
	public boolean isTitle = false;
	/** Number of title fields. */
	public int titleIndexT = 0;
	/** Name of the record field whose value becomes the document key. */
	public String idFldName = null;
	/** Names of the fields concatenated to form the document title; may be null. */
	public String[] titleFldNames = null;
	/** True when preview fields were supplied. */
	public boolean isPreview = false;
	/** Number of preview fields. */
	public int previewIndexT = 0;
	/** Names of the fields copied into the document preview; may be null. */
	public String[] previewFldNames = null;
	/** Not read or written by this class. */
	public String dictKeepField = null;
	/** Not read or written by this class. */
	public String dictRecordType = null;
	/** Not read or written by this class. */
	public String urlPrefix = null;
	/** Element name handed to the record extractor; taken from the template document's docType. */
	public String recordTag = null;

	/** Template whose attributes are copied onto every generated document. */
	private HDocument pristineDoc = null;
	/** Pipeline passed to the index writer for every batch. */
	private List<PipeIn> runPlan = null;
	
	/** Records buffered since the last batch insert. */
	private List<Map<String,String>> records = null;
	/** Zero-based index of the last ID-bearing record seen; reset to -1 at the start of a run. */
	int readDocs = 0;
	/** Passed through to {@link IndexWriter#insertBatch}. */
	boolean isMultiWriter = false;
	/** When true, bucket id and serial id are assigned to each document explicitly. */
	boolean firstTimeLoad = false;
	
	/** Bucket counter; incremented once before every batch. */
	long bucket = -1L;
	
	String tenant = "anonymous";
	AccountInfo acc = null;
	
	/** Start time (ms) of the most recent batch insert. */
	long start = 0;
	/** End time (ms) of the most recent batch insert. */
	long end = 0;
	/** Accumulated insert time (ms) over all batches processed by this instance. */
	int total = 0;
	/** Scratch buffer for the per-batch timing report. */
	StringBuilder reportSb = new StringBuilder(100);

	/** Source of generated IDs for records lacking the ID field; shared by all instances. */
	static Long id = 0L;
	/** Running total of buffered records handed to insert(); shared by all instances. */
	static long batch = 0;
	
	/**
	 * Creates a crawler for one XML file.
	 *
	 * @param tenant        account name; when empty the default "anonymous" is used.
	 *                      An account with one bucket is created and stored if none exists.
	 * @param filePath      path of the XML file, resolved through {@link FileReaderUtil#getFile}
	 * @param doc           template document; its docType names the record element
	 * @param idFldName     record field holding the document key
	 * @param titleFldNames fields forming the title, or null for no title
	 * @param previewFields fields forming the preview, or null for no preview
	 * @param plan          pipeline to run per batch; null selects the writer's insert pipes
	 * @param startIndex    zero-based index of the first record to index
	 * @param endIndex      zero-based index of the last record to index, or -1 for no limit
	 * @param batchSize     number of records per batch insert
	 * @param firstTimeLoad whether bucket and serial ids are assigned explicitly
	 * @param startBucket   bucket counter seed; the first batch uses startBucket + 1
	 * @throws ApplicationFault if the template is null, the file is missing or
	 *                          unreadable, or startIndex is not below endIndex
	 * @throws SystemFault      declared for the account and index-writer lookups
	 */
	public XmlFileCrawler(
			String tenant, String filePath,
			HDocument doc, String idFldName, 
			String[] titleFldNames,
			String[] previewFields,  List<PipeIn> plan, 
			int startIndex, int endIndex, int batchSize, boolean firstTimeLoad, long startBucket) 
		throws ApplicationFault, SystemFault {

		// Fail with a descriptive fault instead of a NullPointerException below.
		if ( null == doc ) {
			throw new ApplicationFault("XmlFileFetcher > Template document is missing.");
		}

		if ( ! StringUtils.isEmpty(tenant)) this.tenant = tenant;
			
		this.acc = Account.getAccount(this.tenant);
		if ( null == this.acc) {
			this.acc = new AccountInfo(this.tenant);
			this.acc.name = this.tenant;
			this.acc.maxbuckets = 1;
			Account.storeAccount(this.acc);
		}
			
		this.pristineDoc = doc;
		this.recordTag = this.pristineDoc.docType; 
		this.aFile = FileReaderUtil.getFile(filePath);
		if ( !this.aFile.exists() ) {
			throw new ApplicationFault(
				"XmlFileFetcher > File does not exist, " + this.aFile.getAbsolutePath());
		}
		if ( !this.aFile.canRead() ) {
			throw new ApplicationFault(
				"XmlFileFetcher > Can't read the file, " + this.aFile.getAbsolutePath());
		}
			
		// The file's base name doubles as the element name wrapped around previews.
		String fileName = this.aFile.getName();
		int dotAt = fileName.lastIndexOf('.');
		this.docName = ( dotAt == -1) ? fileName : fileName.substring(0, dotAt);
		this.recordStartTag = "<" + this.docName + ">"; 
		this.recordEndTag = "</" + this.docName + ">";
	    	
		this.idFldName = idFldName;
		this.titleFldNames = titleFldNames;
		this.previewFldNames = previewFields;
		
		if ( endIndex != -1 && startIndex >= endIndex ) {
			throw new ApplicationFault("Not allowed as reading ends at " + endIndex);
		}
		this.startIndex = startIndex;
		this.endIndex = endIndex;
		this.isEndIndex = ( this.endIndex > 0 );
		
		this.batchSize = batchSize;
		this.runPlan = ( null == plan) ? 
			IndexWriter.getInstance().getInsertPipes() : plan;
		this.firstTimeLoad = firstTimeLoad;
		this.bucket = startBucket;
	}

	/**
	 * Parses the file and indexes its records.
	 *
	 * <p>Full batches are indexed from {@link #handleRecord(Map)} while parsing
	 * is in progress; whatever remains buffered afterwards is indexed here.</p>
	 *
	 * @throws ApplicationFault if the ID field name is empty, or wrapping any
	 *                          exception raised while opening or parsing the file
	 * @throws SystemFault      declared for callers; not raised directly here
	 */
	public void fetchAndIndex() throws ApplicationFault, SystemFault {
		
		l.info("XmlFileFetcher > Loading " + this.aFile.getName());
		this.initialize();
		
		// Incremented before use, so the first ID-bearing record gets index 0.
		this.readDocs = -1;
		InputStream stream = null;
		try {
			this.records = new ArrayList<Map<String,String>>(400);
			XmlRecordExtracter xh = new XmlRecordExtracter(this.recordTag, this);
			// NOTE(review): the parser is created with factory defaults. If the
			// file can come from an untrusted source, DTD / external-entity
			// processing should be disabled on the factory.
			SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
			stream = new FileInputStream(this.aFile);
			sp.parse(stream, xh);

			// Flush the trailing batch; earlier ones were flushed by handleRecord.
			this.processRecords();
		
		} catch (Exception ex) {
			l.fatal("XmlFileFetcher > Error in indexing file " + this.aFile.getName(), ex);
			throw new ApplicationFault(ex);
		
		} finally {
			try {
				if (null != stream) stream.close();
			}  catch (Exception ex) {
				l.warn("XmlFileFetcher > Unable to close " + this.aFile.getName(), ex);
			}
		}
	}

	/**
	 * Callback invoked by {@link XmlRecordExtracter} for every extracted record.
	 * Buffers the record and indexes the buffer once it holds
	 * {@code batchSize} records.
	 *
	 * @param fieldM field name to value for one record
	 * @return true while the buffer is still filling; otherwise the outcome of
	 *         the batch insert, which is false once the end index has been
	 *         passed or the insert failed. The extractor is expected to stop
	 *         on false.
	 */
	public boolean handleRecord(Map<String, String> fieldM) {
		this.records.add(fieldM);
		if (this.records.size() < this.batchSize) return true;

		boolean status = this.processRecords();

		// Hand the maps back to the object factory before dropping them.
		for (Map<String, String> record : this.records) {
			ObjectFactory.getInstance().putStringMap(record);
		}
		this.records.clear();
		return status;
	}

	/**
	 * Indexes the buffered records and logs a timing report.
	 *
	 * <p>The report is computed after the insert has completed and must never
	 * change its outcome, so every divisor is checked for zero.</p>
	 *
	 * @return the result of {@link #insert()}, or false if the insert threw
	 */
	private boolean processRecords() {
		try {
			start = System.currentTimeMillis();
			boolean status = this.insert();
			end = System.currentTimeMillis();

			int timeTaken = (int) (end - start);
			total = total + timeTaken;

			if ( l.isInfoEnabled() ) {
				int inBatch = this.records.size();
				// readDocs is a zero-based index, hence the + 1 for a count.
				int seenSoFar = this.readDocs + 1;

				reportSb.setLength(0);
				reportSb.append("\nTotal records written " );
				reportSb.append(seenSoFar);
				reportSb.append(" , in " );
				reportSb.append( (inBatch > 0) ? timeTaken / inBatch : 0 );
				reportSb.append(" (ms)/rec and average, ");
				reportSb.append( (seenSoFar > 0) ? total / seenSoFar : 0 );
				reportSb.append(" (ms)/rec with status, ");
				reportSb.append(status);
				l.info(reportSb.toString());
				reportSb.setLength(0);
			}
			return status;
		
		} catch (Exception e){
			// NOTE(review): the failure is reported to the extractor only as
			// "stop"; fetchAndIndex() still returns normally afterwards.
			l.error("XmlFileFetcher > Error in indexing a batch of " 
				+ this.aFile.getName(), e);
			return false;
		}
	}

	/**
	 * Derives the title/preview flags, counts and tag maps from the
	 * configured field names.
	 *
	 * @throws ApplicationFault if the ID field name is empty
	 */
	private void initialize() throws ApplicationFault {
		
		if (StringUtils.isEmpty(this.idFldName))  
			throw new ApplicationFault("ID field is not known.");

		this.isTitle = (null != this.titleFldNames);
		if (isTitle) this.titleIndexT = this.titleFldNames.length;
		
		this.isPreview = (null != this.previewFldNames);
		if (isPreview) {
			this.startTag = new HashMap<String, String>();
			this.endTag = new HashMap<String, String>();
			for (String previewField : this.previewFldNames) {
				this.startTag.put(previewField, ("<" + previewField + ">"));
				this.endTag.put(previewField, ("</" + previewField + ">"));
			}
			this.previewIndexT = this.previewFldNames.length;
		}
	}

	/**
	 * Converts the buffered records into documents and inserts them as one batch.
	 *
	 * <p>Records before {@code startIndex} are skipped; the first record past
	 * {@code endIndex} ends the conversion. The ID field is removed from each
	 * record map, and every remaining non-empty value becomes an XML-encoded
	 * field of the document.</p>
	 *
	 * @return false once the end index has been passed, true otherwise
	 * @throws Exception whatever the index writer raises
	 */
	private boolean insert() throws Exception {
		this.bucket++;
		boolean indexNext = true;
		StringBuilder title = isTitle ? new StringBuilder() : null;
		StringBuilder xml = isPreview ? new StringBuilder(1024) : null;
		
		List<HDocument> hdocs = new ArrayList<HDocument>(records.size());
		short docSerial = 1;
		for (Map<String,String> cols : this.records) {
			
			if (! cols.containsKey(this.idFldName)) {
				if ( l.isInfoEnabled())
					l.info("Missing ID field, skipping" + cols.keySet().toString());
				// NOTE(review): a generated ID is stored in the record, yet the
				// record is skipped anyway. Kept as is; confirm which of the
				// two was intended.
				cols.put(this.idFldName, id.toString());
				id = id + 1;
				continue;
			}
			
			readDocs++;
			if ( readDocs < this.startIndex) continue;
			if ( isEndIndex && readDocs > this.endIndex ) {
				indexNext = false;
				break;
			}

			// Reuse the scratch buffers across records.
			if ( isTitle ) title.setLength(0);
			if ( isPreview ) xml.setLength(0);
			
			HDocument aDoc = this.newDocumentFromTemplate();
			aDoc.key = cols.get(this.idFldName);
			
			if ( this.firstTimeLoad) {
				aDoc.bucketId = this.bucket;
				aDoc.docSerialId = docSerial++;
			}
			
			cols.remove(this.idFldName);
			if ( cols.size() > 0 ) aDoc.fields = new ArrayList<Field>(cols.size());
			for (Map.Entry<String,String> col : cols.entrySet()) {
				String colVal = StringUtils.encodeXml(col.getValue());
				if ( StringUtils.isEmpty(colVal) ) continue;
				aDoc.fields.add(new HField(col.getKey(), colVal));
			}
			
			// No body text is accumulated for XML records.
			aDoc.cacheText = "";
			if ( isTitle) aDoc.title = buildTitle(title, cols);
			if ( isPreview ) aDoc.preview = this.buildPreview(xml, cols);
			hdocs.add(aDoc);
		}

		int amount = this.records.size();
		if ( l.isInfoEnabled() ) l.info("Incrementing : " + batch + " .. " + amount);
		batch = batch + amount;

		long s = System.currentTimeMillis();
		IndexWriter.getInstance().insertBatch(hdocs, this.acc, runPlan,isMultiWriter);
		long e = System.currentTimeMillis();
		if ( l.isInfoEnabled() ) 
			l.info("Incremented For :" + amount + " in ms " + (e -s));
		return indexNext;
	}

	/**
	 * Creates a document carrying the template's attributes.
	 *
	 * @return a new document; key, fields, title and preview are left unset
	 */
	private HDocument newDocumentFromTemplate() {
		HDocument aDoc = new HDocument();
		aDoc.docType = this.pristineDoc.docType;
		aDoc.url = this.pristineDoc.url;
		aDoc.eastering = this.pristineDoc.eastering;
		aDoc.team = this.pristineDoc.team;
		aDoc.editPermission = this.pristineDoc.editPermission;
		aDoc.ipAddress = this.pristineDoc.ipAddress;
		aDoc.locale = this.pristineDoc.locale;
		aDoc.northing = this.pristineDoc.northing;
		aDoc.securityHigh= this.pristineDoc.securityHigh;
		aDoc.sentimentPositive = this.pristineDoc.sentimentPositive;
		aDoc.viewPermission = this.pristineDoc.viewPermission;
		return aDoc;
	}

	/**
	 * Builds the title of one record.
	 *
	 * @param title  empty scratch buffer
	 * @param fieldM the record, without its ID field
	 * @return with exactly one title field, that field's raw value (possibly
	 *         null); otherwise the non-empty title values joined by single
	 *         spaces, trimmed
	 */
	private String buildTitle(StringBuilder title, Map<String,String> fieldM) {
		
		if (1 == this.titleIndexT) {
			return fieldM.get(this.titleFldNames[0]);
		}

		for (int titleI = 0; titleI < this.titleIndexT; titleI++) {
			String colVal = fieldM.get(this.titleFldNames[titleI]);
			if (StringUtils.isEmpty(colVal)) continue;
			title.append(colVal);
			title.append(' ');
		}
		return title.toString().trim();
	}

	/**
	 * Builds the preview of one record: the record start tag, one element
	 * per non-empty preview field, then the record end tag.
	 *
	 * @param xml    empty scratch buffer
	 * @param fieldM the record, without its ID field
	 * @return the preview markup
	 */
	private String buildPreview(StringBuilder xml, 
		Map<String,String> fieldM) {
		
		xml.append(this.recordStartTag);

		for (int previewI = 0; previewI < previewIndexT; previewI++) {
			String fldName = this.previewFldNames[previewI];
			String colVal = fieldM.get(fldName);
			if (StringUtils.isEmpty(colVal)) continue;

			// NOTE(review): the value is written without XML escaping, unlike
			// the document fields - confirm that preview consumers expect that.
			xml.append(this.startTag.get(fldName));
			xml.append(colVal);
			xml.append(this.endTag.get(fldName));
		}

		xml.append(this.recordEndTag);
		return xml.toString();
	}

	/**
	 * @return the flag passed to the index writer with every batch
	 */
	public boolean isMultiWriter() {
		return isMultiWriter;
	}

	/**
	 * @param isMultiWriter the flag to pass to the index writer with every batch
	 */
	public void setMultiWriter(boolean isMultiWriter) {
		this.isMultiWriter = isMultiWriter;
	}
}