/**
 * Copyright (c) 2008-2010 Mark Logic Corporation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * The use of the Apache License does not indicate that this project is
 * affiliated with the Apache Software Foundation.
 */
package com.marklogic.recordloader.xcc;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Locale;

import com.marklogic.ps.Utilities;
import com.marklogic.ps.timing.TimedEvent;
import com.marklogic.recordloader.Configuration;
import com.marklogic.recordloader.FatalException;
import com.marklogic.recordloader.LoaderException;
import com.marklogic.recordloader.TranscodingLoader;

/**
 * Loads delimited text input: the first line supplies the field labels, and
 * every following line is turned into one XML record whose child elements
 * are named after those labels.
 *
 * @author Michael Blakeley, michael.blakeley@marklogic.com
 *
 */
public class DelimitedDataLoader extends TranscodingLoader {

    DelimitedDataConfiguration config;

    private String recordName;

    private String idName;

    private String fieldDelimiter;

    private int lineNumber;

    private boolean isFatalErrors;

    private String[] fields;

    private String[] labels;

    /** Index of the label that matched idName; stays 0 if none matched. */
    private int labelIndex;

    /** Charset name of the decoder, or null when no decoder is set. */
    private String charsetName;

    /**
     * Reads the label line, then processes the remaining input one line at
     * a time so that a bad record does not prevent later records from being
     * handled (unless the configuration marks errors as fatal).
     *
     * @throws LoaderException
     *             if the superclass processing fails
     * @throws FatalException
     *             if the configuration is not a DelimitedDataConfiguration,
     *             or if an error occurs while errors are configured as fatal
     *
     * @see com.marklogic.recordloader.AbstractLoader#process()
     */
    @Override
    public void process() throws LoaderException {
        super.process();

        logger.fine("starting with decoder = " + decoder);
        if (null != decoder) {
            charsetName = decoder.charset().name();
            logger.fine("using " + charsetName);
        }

        if (!(super.config instanceof DelimitedDataConfiguration)) {
            throw new FatalException(
                    Configuration.CONFIGURATION_CLASSNAME_KEY
                            + " must be set to "
                            + DelimitedDataConfiguration.class.getName());
        }
        config = (DelimitedDataConfiguration) super.config;

        fieldDelimiter = config.getFieldDelimiter();
        idName = config.getIdNodeName();
        recordName = config.getRecordName();
        isFatalErrors = config.isFatalErrors();
        boolean downcaseLabels = config.isDowncaseLabels();

        // InputStreamReader rejects a null decoder with a
        // NullPointerException, so fall back to the platform default,
        // matching the null-decoder fallback used when encoding the output.
        BufferedReader br = new BufferedReader(
                (null == decoder) ? new InputStreamReader(input)
                        : new InputStreamReader(input, decoder));

        String line;
        lineNumber = 0;
        labelIndex = 0;

        if (downcaseLabels) {
            // Locale.ROOT keeps the result independent of the JVM locale
            recordName = recordName.toLowerCase(Locale.ROOT);
        }

        try {
            // first line contains the labels
            line = br.readLine();
            if (null == line) {
                // empty input: report it clearly instead of a bare NPE
                throw new LoaderException("no label line found in "
                        + ((null == currentRecordPath) ? "stdin"
                                : currentRecordPath));
            }
            lineNumber++;
            labels = line.split(fieldDelimiter);

            // match the configured idName with the input labels
            for (int i = 0; i < labels.length; i++) {
                if (downcaseLabels) {
                    labels[i] = labels[i].toLowerCase(Locale.ROOT);
                }
                if (idName.equals(labels[i])) {
                    labelIndex = i;
                    // do not exit loop - must downcase remaining labels
                }
            }
            logger.info("found labels " + labels.length);

            while (null != (line = br.readLine())) {
                String xml = null;
                // line-by-line, so we can move on after errors
                try {
                    xml = handleRecord(line);
                    event.stop();
                } catch (Exception e) {
                    if (isFatalErrors) {
                        throw toFatal(e);
                    }
                    event.stop(true);
                    logger.logException(e);
                } finally {
                    updateMonitor((null != xml) ? xml.length() : 0);
                    cleanupRecord();
                }
            }
        } catch (Exception e) {
            if (isFatalErrors) {
                // a FatalException raised by the per-record handler lands
                // here too: rethrow it as-is rather than wrapping it twice
                throw toFatal(e);
            }
            // NOTE(review): assumes event was already initialized (e.g. by
            // super.process()) if the failure precedes the first record -
            // confirm
            event.stop(true);
            logger.logException(e);
        } finally {
            try {
                br.close();
            } catch (IOException e) {
                // no point in doing anything...
                logger.logException(e);
            }
            cleanupInput(event.isError());
        }
    }

    /**
     * Returns the given exception as a FatalException, wrapping it only if
     * it is not one already.
     *
     * @param e
     *            the exception to convert
     * @return e itself if it is a FatalException, otherwise a new
     *         FatalException with e as its cause
     */
    private FatalException toFatal(Exception e) {
        if (e instanceof FatalException) {
            return (FatalException) e;
        }
        return new FatalException(e);
    }

    /**
     * Splits one input line into fields, builds the record XML and, unless
     * the record is being skipped, inserts it.
     *
     * @param line
     *            one line of delimited input (not the label line)
     * @return the XML built for this line
     * @throws LoaderException
     *             if the line yields a different number of fields than
     *             there are labels
     * @throws IOException
     *             if the XML cannot be encoded with the decoder's charset
     */
    private String handleRecord(String line) throws LoaderException,
            IOException {
        String xml;
        String id;
        event = new TimedEvent();
        lineNumber++;

        // TODO this is too simplistic for CSV with quoted values
        // With a positive limit, split() keeps trailing empty strings and
        // returns at most labels.length fields: any extra delimiters stay
        // inside the last field, so only "too few fields" is caught below.
        fields = line.split(fieldDelimiter, labels.length);

        // sanity check
        if (fields.length != labels.length) {
            String msg = "document mismatch:" + " fields="
                    + fields.length + ", labels=" + labels.length
                    + " at "
                    + ((null == currentRecordPath) ? "stdin"
                            : currentRecordPath) + ":" + lineNumber
                    + ": " + line;
            // caller will decide if this is fatal or not
            throw new LoaderException(msg);
        }

        id = fields[labelIndex];
        currentUri = composeUri(id);
        content = contentFactory.newContent(currentUri);
        boolean skippingRecord = checkIdAndUri(currentRecordPath);

        xml = getXml(labels, fields);
        if (null != xml && !skippingRecord) {
            // write the xml
            // NB - getBytes will return the default-encoding bytes
            content.setBytes(null == decoder ? xml.getBytes() : xml
                    .getBytes(charsetName));
            insert();
        }
        return xml;
    }

    /**
     * Builds the XML for one record: a recordName element containing one
     * child element per label, whose text is the XML-escaped field value.
     * Labels are used as element names without any escaping or validation.
     *
     * @param labels
     *            element names, one per field
     * @param fields
     *            field values, parallel to labels
     * @return the record serialized as an XML string
     */
    private String getXml(String[] labels, String[] fields) {
        // build the xml
        StringBuilder xml = new StringBuilder();
        xml.append('<').append(recordName).append('>');
        for (int i = 0; i < labels.length; i++) {
            xml.append('<').append(labels[i]).append('>');
            xml.append(Utilities.escapeXml(fields[i]));
            xml.append("</").append(labels[i]).append('>');
        }
        xml.append("</").append(recordName).append('>');
        return xml.toString();
    }

}