/** * Copyright Plugtree LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.plugtree.solrmeter.model.extractor; import java.util.LinkedList; import java.util.List; import org.apache.log4j.Logger; import org.apache.solr.common.SolrInputDocument; import com.plugtree.solrmeter.model.FileUtils; import com.plugtree.solrmeter.model.InputDocumentExtractor; /** * Extracts documents from text files. The file must be fieldName:fieldValue;fieldName2:fieldValue2;... * and all required fields must be added. * if a ";" character is part of some value (and it is not a field separator) it must be escaped with a "\" character. * All "\" on a value must be escaped as "\\" * @see com.plugtree.solrmeter.extractor.FileInputDocumentExtractorTestCase.testEscapedChars() * @author tflobbe * */ public class FileInputDocumentExtractor implements InputDocumentExtractor { private final static Logger logger = Logger.getLogger(FileInputDocumentExtractor.class); /** * The list of extracted documents */ protected List<SolrInputDocument> documents; public FileInputDocumentExtractor(String inputFilePath) { super(); documents = new LinkedList<SolrInputDocument>(); loadDocuments(inputFilePath); } /** * Loads all documents from text file */ protected void loadDocuments(String inputFilePath) { List<String> documentStrings = FileUtils.loadStringsFromFile(inputFilePath); documents = this.createDocumentList(documentStrings); } private List<SolrInputDocument> createDocumentList(List<String> documentsStrings) { List<SolrInputDocument> list = new LinkedList<SolrInputDocument>(); for(String documentString:documentsStrings) { list.add(this.createSolrDocument(documentString)); } return list; } private SolrInputDocument createSolrDocument(String documentString) { SolrInputDocument document = new SolrInputDocument(); List<String> fields = this.split(documentString); try { for(String field:fields) { try { int idx = field.indexOf(":"); document.addField(field.substring(0, idx), field.substring(idx + 1)); }catch(RuntimeException e) { logger.error("Error Loading documents, on field " + field); throw e; } } } catch(RuntimeException e) { logger.error("Error Loading documents, on document line: " + documentString); throw e; } return document; } private List<String> split(String documentString) { List<String> strings = new LinkedList<String>(); int lastSplitIndex = 0; int nextSplitIndex; while(lastSplitIndex < documentString.length()) { nextSplitIndex = findNextSplitIndex(documentString, lastSplitIndex); String splittedString = documentString.substring(lastSplitIndex, nextSplitIndex); strings.add(removeEscapeCharacters(splittedString)); lastSplitIndex = nextSplitIndex + 1; } return strings; } private String removeEscapeCharacters(String splittedString) { return splittedString.replaceAll("\\\\;", ";").replaceAll("\\\\\\\\", "\\\\"); } /** * Returns the next Index to Split the String * @param documentString * @param lastSplitIndex * @return */ private int findNextSplitIndex(String documentString, int lastSplitIndex) { for(int i = lastSplitIndex; i < documentString.length(); i++) { if(documentString.charAt(i) == '\\') { if(documentString.charAt(i + 1) == '\\' || documentString.charAt(i + 1) == ';') { i++; } }else { if(documentString.charAt(i) == ';') { return i; } } } return documentString.length(); } @Override public SolrInputDocument getRandomDocument() { return (SolrInputDocument) FileUtils.getNextRandomObject(documents); } }