/*
* Copyright 2010 Bizosys Technologies Limited
*
* Licensed to the Bizosys Technologies Limited (Bizosys) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Bizosys licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.bizosys.hsearch;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.log4j.Logger;
import com.bizosys.hsearch.common.Account;
import com.bizosys.hsearch.common.Field;
import com.bizosys.hsearch.common.HDocument;
import com.bizosys.hsearch.common.HField;
import com.bizosys.hsearch.common.Account.AccountInfo;
import com.bizosys.hsearch.index.IndexWriter;
import com.bizosys.hsearch.util.FileReaderUtil;
import com.bizosys.hsearch.util.ObjectFactory;
import com.bizosys.hsearch.util.XmlRecordExtracter;
import com.bizosys.hsearch.util.XmlRecordExtracterCallback;
import com.bizosys.oneline.ApplicationFault;
import com.bizosys.oneline.SystemFault;
import com.bizosys.oneline.pipes.PipeIn;
import com.bizosys.oneline.util.StringUtils;
/**
 * Reads records out of an XML file and indexes them in batches.
 *
 * <p>The file is parsed with a SAX parser driven by {@link XmlRecordExtracter};
 * every extracted record is delivered to {@link #handleRecord(Map)}, buffered,
 * and written through {@link IndexWriter} once {@code batchSize} records have
 * accumulated. Whatever is still buffered when parsing ends is written by
 * {@link #fetchAndIndex()}.
 *
 * <p>Record positions are zero-based: the first record carrying an id field has
 * position 0. Records positioned before {@code startIndex} are not indexed and,
 * when an end index is active, reading stops at the first record positioned
 * after {@code endIndex}.
 */
public class XmlFileCrawler implements XmlRecordExtracterCallback {

    private static final Logger l = Logger.getLogger(XmlFileCrawler.class.getName());

    /** Number of buffered records that triggers a write. */
    private int batchSize = 300;

    /** Preview field name to its opening tag; built in initialize() when previews are enabled. */
    public Map<String, String> startTag = null;
    /** Preview field name to its closing tag; built in initialize() when previews are enabled. */
    public Map<String,String> endTag = null;

    /** Opening tag wrapped around a preview; derived from the file name without extension. */
    public String recordStartTag = null;
    /** Closing tag wrapped around a preview; derived from the file name without extension. */
    public String recordEndTag = null;

    /** Zero-based position of the first record to index. */
    public int startIndex = 0;
    /** Zero-based position of the last record to index; -1 means no limit. */
    public int endIndex = -1;
    /** True when endIndex is greater than zero and therefore enforced. */
    public boolean isEndIndex = false;

    /** File name without its extension. */
    public String docName = null;
    /** The XML file being crawled. */
    public File aFile = null;

    public boolean isTitle = false;
    public int titleIndexT = 0;
    public String idFldName = null;
    public String[] titleFldNames = null;

    public boolean isPreview = false;
    public int previewIndexT = 0;
    public String[] previewFldNames = null;

    public String dictKeepField = null;
    public String dictRecordType = null;
    public String urlPrefix = null;

    /** Tag handed to the record extractor; taken from the template document's docType. */
    public String recordTag = null;

    /** Template whose document-level attributes are copied onto every indexed document. */
    private HDocument pristineDoc = null;
    private List<PipeIn> runPlan = null;
    /** Records buffered since the last write. */
    private List<Map<String,String>> records = null;

    /** Zero-based position of the last record that carried an id field; -1 before any is read. */
    int readDocs = 0;
    boolean isMultiWriter = false;
    boolean firstTimeLoad = false;
    long bucket = -1L;
    String tenant = "anonymous";
    AccountInfo acc = null;

    /**
     * Creates a crawler for one XML file.
     *
     * @param tenant account name; "anonymous" is used when empty. An account
     *        with a single bucket is created and stored when none exists yet.
     * @param filePath location of the XML file, resolved through FileReaderUtil
     * @param doc template document; its docType is used as the record tag and
     *        its document-level attributes are copied to every indexed document
     * @param idFldName name of the field holding the record id
     * @param titleFldNames fields joined into the title, or null for no title
     * @param previewFields fields written into the preview, or null for no preview
     * @param plan insert pipes to run; the IndexWriter defaults are used when null
     * @param startIndex zero-based position of the first record to index
     * @param endIndex zero-based position of the last record to index, -1 for no limit
     * @param batchSize number of records written per batch
     * @param firstTimeLoad when true, bucket and serial ids are assigned by this crawler
     * @param startBucket bucket id preceding the first bucket used
     * @throws ApplicationFault when the file is missing or unreadable, or when
     *         startIndex is not below a given endIndex
     * @throws SystemFault declared for callers; not raised by the visible code itself
     */
    public XmlFileCrawler(
        String tenant, String filePath,
        HDocument doc, String idFldName,
        String[] titleFldNames,
        String[] previewFields, List<PipeIn> plan,
        int startIndex, int endIndex, int batchSize, boolean firstTimeLoad, long startBucket)
        throws ApplicationFault, SystemFault {

        if ( ! StringUtils.isEmpty(tenant)) this.tenant = tenant;
        this.acc = Account.getAccount(this.tenant);
        if ( null == acc) {
            acc = new AccountInfo(this.tenant);
            acc.name = this.tenant;
            acc.maxbuckets = 1;
            Account.storeAccount(acc);
        }

        this.pristineDoc = doc;
        this.recordTag = this.pristineDoc.docType;

        this.aFile = FileReaderUtil.getFile(filePath);
        if ( !aFile.exists() ) {
            throw new ApplicationFault(
                "XmlFileCrawler > File does not exist, " + this.aFile.getAbsolutePath());
        }
        if ( !aFile.canRead() ) {
            throw new ApplicationFault(
                "XmlFileCrawler > Can't read the file, " + this.aFile.getAbsolutePath());
        }

        // The preview wrapper tag is the file name without its extension.
        String fileName = aFile.getName();
        int dotAt = fileName.lastIndexOf('.');
        this.docName = ( dotAt == -1) ? fileName : fileName.substring(0, dotAt);
        this.recordStartTag = "<" + this.docName + ">";
        this.recordEndTag = "</" + this.docName + ">";

        this.idFldName = idFldName;
        this.titleFldNames = titleFldNames;
        this.previewFldNames = previewFields;

        // A single check covers the range; -1 means "read to the end".
        if ( endIndex != -1 && startIndex >= endIndex ) {
            throw new ApplicationFault("Not allowed as reading ends at " + endIndex);
        }
        this.startIndex = startIndex;
        this.endIndex = endIndex;
        if ( this.endIndex > 0 ) isEndIndex = true;

        this.batchSize = batchSize;
        this.runPlan = ( null == plan) ?
            IndexWriter.getInstance().getInsertPipes() : plan;

        this.firstTimeLoad = firstTimeLoad;
        this.bucket = startBucket;
    }

    /**
     * Parses the file and indexes its records.
     *
     * <p>Full batches are written from {@link #handleRecord(Map)} while the
     * parser runs; the records still buffered afterwards are written here.
     *
     * @throws ApplicationFault when the id field name is empty, or wrapping any
     *         exception raised while opening or parsing the file
     * @throws SystemFault declared for callers; not raised by the visible code itself
     */
    public void fetchAndIndex() throws ApplicationFault, SystemFault {
        l.info("XmlFileCrawler > Loading " + this.aFile.getName());
        this.initialize();

        // Positions are zero-based, so the counter starts one below the first record.
        this.readDocs = -1;
        InputStream stream = null;
        try {
            this.records = new ArrayList<Map<String,String>>(400);
            XmlRecordExtracter xh = new XmlRecordExtracter(this.recordTag, this);
            // NOTE(review): the parser is created with factory defaults. If input
            // files can come from untrusted sources, confirm that external entity
            // resolution is disabled.
            SAXParser sp = SAXParserFactory.newInstance().newSAXParser();
            stream = new FileInputStream(this.aFile);
            sp.parse(stream, xh);

            // Write the last, partially filled batch. Skipped when nothing is
            // pending so that no empty batch is sent to the index writer.
            if ( ! this.records.isEmpty() ) this.processRecords();
        } catch (Exception ex) {
            l.fatal("XmlFileCrawler > Error in indexing file " + this.aFile.getName(), ex);
            throw new ApplicationFault(ex);
        } finally {
            try {
                if (null != stream) stream.close();
            } catch (Exception ex) {
                l.warn("XmlFileCrawler > Error closing file " + this.aFile.getName(), ex);
            }
        }
    }

    /**
     * Receives one extracted record from XmlRecordExtracter and buffers it.
     * When the buffer reaches the batch size the batch is written, the record
     * maps are handed back to ObjectFactory and the buffer is emptied.
     *
     * @param fieldM field name to value of the extracted record
     * @return true while buffering; otherwise the status of the write, where
     *         false reports that the end index was passed or the write failed
     *         (presumably telling the extractor to stop; confirm against
     *         XmlRecordExtracter)
     */
    public boolean handleRecord(Map<String, String> fieldM) {
        this.records.add(fieldM);
        if (this.records.size() < this.batchSize) return true;

        boolean status = this.processRecords();
        for (Map<String, String> record : this.records) {
            ObjectFactory.getInstance().putStringMap(record);
        }
        this.records.clear();
        return status;
    }

    /** Start time (ms) of the most recent batch write. */
    long start = 0;
    /** End time (ms) of the most recent batch write. */
    long end = 0;
    /** Accumulated write time (ms) over all batches. */
    int total = 0;
    /** Scratch buffer for the per-batch timing report. */
    StringBuilder reportSb = new StringBuilder(100);

    /**
     * Writes the buffered records and logs a timing report at info level.
     *
     * @return the status returned by insert(), or false when the write threw
     */
    private boolean processRecords() {
        try {
            int batchCount = this.records.size();

            start = System.currentTimeMillis();
            boolean status = this.insert();
            end = System.currentTimeMillis();

            int timeTaken = (int) (end - start);
            total = total + timeTaken;

            // readDocs is a zero-based position, so the count read is one more.
            int docsRead = readDocs + 1;

            if ( l.isInfoEnabled() ) {
                reportSb.setLength(0);
                reportSb.append("\nTotal records written " );
                reportSb.append(docsRead);
                reportSb.append(" , in " );
                reportSb.append(perRecordMillis(timeTaken, batchCount));
                reportSb.append(" (ms)/rec and average, ");
                reportSb.append(perRecordMillis(total, docsRead));
                reportSb.append(" (ms)/rec with status, ");
                reportSb.append(status);
                l.info(reportSb.toString());
            }
            return status;
        } catch (Exception e) {
            l.error("XmlFileCrawler > Error in indexing a batch of file "
                + this.aFile.getName(), e);
            return false;
        }
    }

    /** Integer average of millis over count, or 0 when count is not positive. */
    private static int perRecordMillis(int millis, int count) {
        return ( count > 0 ) ? millis / count : 0;
    }

    /**
     * Validates the id field name and prepares the title and preview settings.
     *
     * @throws ApplicationFault when the id field name is empty
     */
    private void initialize() throws ApplicationFault {
        if (StringUtils.isEmpty(this.idFldName))
            throw new ApplicationFault("ID field is not known.");

        this.isTitle = (null != this.titleFldNames);
        if (isTitle) this.titleIndexT = this.titleFldNames.length;

        this.isPreview = (null != this.previewFldNames);
        if (isPreview) {
            this.startTag = new HashMap<String, String>();
            this.endTag = new HashMap<String, String>();
            for (String previewField : this.previewFldNames) {
                this.startTag.put(previewField, ("<" + previewField + ">"));
                this.endTag.put(previewField, ("</" + previewField + ">"));
            }
            this.previewIndexT = this.previewFldNames.length;
        }
    }

    /** Source of generated ids for records that arrive without an id field. */
    static Long id = 0L;
    /** Running total of records handed to insert() across all crawlers. */
    static long batch = 0;

    /**
     * Converts the buffered records to documents and writes them as one batch.
     * Each call moves on to the next bucket.
     *
     * @return false when a record positioned after the end index was met,
     *         true otherwise
     * @throws Exception whatever the index writer raises
     */
    private boolean insert() throws Exception {
        this.bucket++;
        boolean indexNext = true;

        // Builders are reused across the records of the batch.
        StringBuilder title = isTitle ? new StringBuilder() : null;
        StringBuilder xml = isPreview ? new StringBuilder(1024) : null;

        List<HDocument> hdocs = new ArrayList<HDocument>(records.size());
        short docSerial = 1;

        for (Map<String,String> cols : this.records) {
            if (! cols.containsKey(this.idFldName)) {
                if ( l.isInfoEnabled())
                    l.info("Missing ID field, skipping" + cols.keySet().toString());
                // NOTE(review): a generated id is stored in the record, yet the
                // record is still skipped; confirm whether it should be indexed.
                cols.put(this.idFldName, id.toString());
                id = id + 1;
                continue;
            }

            readDocs++;
            if ( readDocs < this.startIndex) continue;
            if ( isEndIndex && readDocs > this.endIndex ) {
                indexNext = false;
                break;
            }

            if ( isTitle ) title.setLength(0);
            if ( isPreview ) xml.setLength(0);

            HDocument aDoc = newDocFromTemplate();
            aDoc.key = cols.get(this.idFldName);
            if ( this.firstTimeLoad) {
                aDoc.bucketId = this.bucket;
                aDoc.docSerialId = docSerial++;
            }

            // The id is carried as the document key, not as a field.
            cols.remove(this.idFldName);
            if ( cols.size() > 0 ) aDoc.fields = new ArrayList<Field>(cols.size());
            for (Map.Entry<String,String> col : cols.entrySet()) {
                String colVal = StringUtils.encodeXml(col.getValue());
                if ( StringUtils.isEmpty(colVal) ) continue;
                aDoc.fields.add(new HField(col.getKey(), colVal));
            }

            // No body text is assembled for XML records; the cached text stays empty.
            aDoc.cacheText = "";
            if (l.isDebugEnabled()) l.debug("Tab body text:" + aDoc.cacheText);

            if ( isTitle) aDoc.title = buildTitle(title, cols);
            if ( isPreview ) aDoc.preview = this.buildPreview(xml, cols);
            hdocs.add(aDoc);
        }

        int amount = this.records.size();
        long s = System.currentTimeMillis();
        if ( l.isInfoEnabled() ) l.info("Incrementing : " + batch + " .. " + amount);
        batch = batch + amount;
        IndexWriter.getInstance().insertBatch(hdocs, this.acc, runPlan,isMultiWriter);
        long e = System.currentTimeMillis();
        if ( l.isInfoEnabled() )
            l.info("Incremented For :" + this.records.size() + " in ms " + (e -s));
        return indexNext;
    }

    /** Creates a document carrying the document-level attributes of the template. */
    private HDocument newDocFromTemplate() {
        HDocument aDoc = new HDocument();
        aDoc.docType = this.pristineDoc.docType;
        aDoc.url = this.pristineDoc.url;
        aDoc.eastering = this.pristineDoc.eastering;
        aDoc.team = this.pristineDoc.team;
        aDoc.editPermission = this.pristineDoc.editPermission;
        aDoc.ipAddress = this.pristineDoc.ipAddress;
        aDoc.locale = this.pristineDoc.locale;
        aDoc.northing = this.pristineDoc.northing;
        aDoc.securityHigh = this.pristineDoc.securityHigh;
        aDoc.sentimentPositive = this.pristineDoc.sentimentPositive;
        aDoc.viewPermission = this.pristineDoc.viewPermission;
        return aDoc;
    }

    /**
     * Builds the title from the configured title fields.
     *
     * @param title empty builder to assemble a multi-field title in
     * @param fieldM field name to value of the record
     * @return the raw value of the only title field (possibly null) when one
     *         field is configured; otherwise the non-empty values joined by
     *         single spaces
     */
    private String buildTitle(StringBuilder title, Map<String,String> fieldM) {
        if (1 == this.titleIndexT) {
            return fieldM.get(this.titleFldNames[0]);
        }

        for (int titleI = 0; titleI < this.titleIndexT; titleI++) {
            String colVal = fieldM.get(this.titleFldNames[titleI]);
            if (StringUtils.isEmpty(colVal)) continue;
            title.append(colVal);
            title.append(' ');
        }
        return title.toString().trim();
    }

    /**
     * Builds the preview: the record tags wrapped around one element per
     * non-empty preview field.
     *
     * @param xml empty builder to assemble the preview in
     * @param fieldM field name to value of the record
     * @return the preview markup
     */
    private String buildPreview(StringBuilder xml,
        Map<String,String> fieldM) {

        xml.append(this.recordStartTag);
        for (int previewI = 0; previewI < previewIndexT; previewI++) {
            String fldName = this.previewFldNames[previewI];
            String colVal = fieldM.get(fldName);
            if (StringUtils.isEmpty(colVal)) continue;
            // NOTE(review): the value is appended as-is, unlike indexed fields
            // which pass through encodeXml; confirm this is intended.
            xml.append(this.startTag.get(fldName));
            xml.append(colVal);
            xml.append(this.endTag.get(fldName));
        }
        xml.append(this.recordEndTag);
        return xml.toString();
    }

    /** @return the multi-writer flag passed to the index writer on every batch */
    public boolean isMultiWriter() {
        return isMultiWriter;
    }

    /** @param isMultiWriter the multi-writer flag to pass to the index writer */
    public void setMultiWriter(boolean isMultiWriter) {
        this.isMultiWriter = isMultiWriter;
    }
}