package org.apache.lucene.benchmark.byTask.tasks; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedOutputStream; import java.io.BufferedWriter; import java.io.FileOutputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.feeds.DocMaker; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; /** * A task which writes documents, one line per document. Each line is in the * following format: title <TAB> date <TAB> body. The output of this * task can be consumed by * {@link org.apache.lucene.benchmark.byTask.feeds.LineDocSource} and is intended * to save the IO overhead of opening a file per document to be indexed.<br> * Supports the following parameters: * <ul> * <li>line.file.out - the name of the file to write the output to. That * parameter is mandatory. <b>NOTE:</b> the file is re-created. * <li>bzip.compression - whether the output should be bzip-compressed. This is * recommended when the output file is expected to be large. (optional, default: * false). * </ul> * <b>NOTE:</b> this class is not thread-safe and if used by multiple threads the * output is unspecified (as all will write to the same output file in a * non-synchronized way). */ public class WriteLineDocTask extends PerfTask { public final static char SEP = '\t'; private int docSize = 0; private PrintWriter lineFileOut = null; private DocMaker docMaker; private ThreadLocal<StringBuilder> threadBuffer = new ThreadLocal<StringBuilder>(); private ThreadLocal<Matcher> threadNormalizer = new ThreadLocal<Matcher>(); public WriteLineDocTask(PerfRunData runData) throws Exception { super(runData); Config config = runData.getConfig(); String fileName = config.get("line.file.out", null); if (fileName == null) { throw new IllegalArgumentException("line.file.out must be set"); } OutputStream out = new FileOutputStream(fileName); boolean doBzipCompression = false; String doBZCompress = config.get("bzip.compression", null); if (doBZCompress != null) { // Property was set, use the value. doBzipCompression = Boolean.valueOf(doBZCompress).booleanValue(); } else { // Property was not set, attempt to detect based on file's extension doBzipCompression = fileName.endsWith("bz2"); } if (doBzipCompression) { // Wrap with BOS since BZip2CompressorOutputStream calls out.write(int) // and does not use the write(byte[]) version. This proved to speed the // compression process by 70% ! out = new BufferedOutputStream(out, 1 << 16); out = new CompressorStreamFactory().createCompressorOutputStream("bzip2", out); } lineFileOut = new PrintWriter(new BufferedWriter(new OutputStreamWriter(out, "UTF-8"), 1 << 16)); docMaker = runData.getDocMaker(); } @Override protected String getLogMessage(int recsCount) { return "Wrote " + recsCount + " line docs"; } @Override public int doLogic() throws Exception { Document doc = docSize > 0 ? docMaker.makeDocument(docSize) : docMaker.makeDocument(); Matcher matcher = threadNormalizer.get(); if (matcher == null) { matcher = Pattern.compile("[\t\r\n]+").matcher(""); threadNormalizer.set(matcher); } Field f = doc.getField(DocMaker.BODY_FIELD); String body = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : ""; f = doc.getField(DocMaker.TITLE_FIELD); String title = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : ""; if (body.length() > 0 || title.length() > 0) { f = doc.getField(DocMaker.DATE_FIELD); String date = f != null ? matcher.reset(f.stringValue()).replaceAll(" ") : ""; StringBuilder sb = threadBuffer.get(); if (sb == null) { sb = new StringBuilder(); threadBuffer.set(sb); } sb.setLength(0); sb.append(title).append(SEP).append(date).append(SEP).append(body); // lineFileOut is a PrintWriter, which synchronizes internally in println. lineFileOut.println(sb.toString()); } return 1; } @Override public void close() throws Exception { lineFileOut.close(); super.close(); } /** * Set the params (docSize only) * @param params docSize, or 0 for no limit. */ @Override public void setParams(String params) { if (super.supportsParams()) { super.setParams(params); } docSize = (int) Float.parseFloat(params); } @Override public boolean supportsParams() { return true; } }