/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.benchmark.byTask.feeds; import java.io.Closeable; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.text.ParsePosition; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.Properties; import java.util.Random; import java.util.TimeZone; import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoublePoint; import org.apache.lucene.document.Field; import org.apache.lucene.document.FloatPoint; import org.apache.lucene.document.IntPoint; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexOptions; /** * Creates {@link Document} objects. Uses a {@link ContentSource} to generate * {@link DocData} objects. Supports the following parameters: * <ul> * <li><b>content.source</b> - specifies the {@link ContentSource} class to use * (default <b>SingleDocSource</b>). * <li><b>doc.stored</b> - specifies whether fields should be stored (default * <b>false</b>). * <li><b>doc.body.stored</b> - specifies whether the body field should be stored (default * = <b>doc.stored</b>). * <li><b>doc.tokenized</b> - specifies whether fields should be tokenized * (default <b>true</b>). * <li><b>doc.body.tokenized</b> - specifies whether the * body field should be tokenized (default = <b>doc.tokenized</b>). * <li><b>doc.body.offsets</b> - specifies whether to add offsets into the postings index * for the body field. It is useful for highlighting. (default <b>false</b>) * <li><b>doc.tokenized.norms</b> - specifies whether norms should be stored in * the index or not. (default <b>false</b>). * <li><b>doc.body.tokenized.norms</b> - specifies whether norms should be * stored in the index for the body field. This can be set to true, while * <code>doc.tokenized.norms</code> is set to false, to allow norms storing just * for the body field. (default <b>true</b>). * <li><b>doc.term.vector</b> - specifies whether term vectors should be stored * for fields (default <b>false</b>). * <li><b>doc.term.vector.positions</b> - specifies whether term vectors should * be stored with positions (default <b>false</b>). * <li><b>doc.term.vector.offsets</b> - specifies whether term vectors should be * stored with offsets (default <b>false</b>). * <li><b>doc.store.body.bytes</b> - specifies whether to store the raw bytes of * the document's content in the document (default <b>false</b>). * <li><b>doc.reuse.fields</b> - specifies whether Field and Document objects * should be reused (default <b>true</b>). * <li><b>doc.index.props</b> - specifies whether the properties returned by * <li><b>doc.random.id.limit</b> - if specified, docs will be assigned random * IDs from 0 to this limit. This is useful with UpdateDoc * for testing performance of IndexWriter.updateDocument. * {@link DocData#getProps()} will be indexed. (default <b>false</b>). * </ul> */ public class DocMaker implements Closeable { private static class LeftOver { private DocData docdata; private int cnt; } private Random r; private int updateDocIDLimit; /** * Document state, supports reuse of field instances * across documents (see <code>reuseFields</code> parameter). */ protected static class DocState { private final Map<String,Field> fields; private final Map<String,Field> numericFields; private final boolean reuseFields; final Document doc; DocData docData = new DocData(); public DocState(boolean reuseFields, FieldType ft, FieldType bodyFt) { this.reuseFields = reuseFields; if (reuseFields) { fields = new HashMap<>(); numericFields = new HashMap<>(); // Initialize the map with the default fields. fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyFt)); fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", ft)); fields.put(DATE_FIELD, new Field(DATE_FIELD, "", ft)); fields.put(ID_FIELD, new StringField(ID_FIELD, "", Field.Store.YES)); fields.put(NAME_FIELD, new Field(NAME_FIELD, "", ft)); numericFields.put(DATE_MSEC_FIELD, new LongPoint(DATE_MSEC_FIELD, 0L)); numericFields.put(TIME_SEC_FIELD, new IntPoint(TIME_SEC_FIELD, 0)); doc = new Document(); } else { numericFields = null; fields = null; doc = null; } } /** * Returns a field corresponding to the field name. If * <code>reuseFields</code> was set to true, then it attempts to reuse a * Field instance. If such a field does not exist, it creates a new one. */ Field getField(String name, FieldType ft) { if (!reuseFields) { return new Field(name, "", ft); } Field f = fields.get(name); if (f == null) { f = new Field(name, "", ft); fields.put(name, f); } return f; } Field getNumericField(String name, Class<? extends Number> numericType) { Field f; if (reuseFields) { f = numericFields.get(name); } else { f = null; } if (f == null) { if (numericType.equals(Integer.class)) { f = new IntPoint(name, 0); } else if (numericType.equals(Long.class)) { f = new LongPoint(name, 0L); } else if (numericType.equals(Float.class)) { f = new FloatPoint(name, 0.0F); } else if (numericType.equals(Double.class)) { f = new DoublePoint(name, 0.0); } else { throw new UnsupportedOperationException("Unsupported numeric type: " + numericType); } if (reuseFields) { numericFields.put(name, f); } } return f; } } private boolean storeBytes = false; private static class DateUtil { public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH); public Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); public ParsePosition pos = new ParsePosition(0); public DateUtil() { parser.setLenient(true); } } // leftovers are thread local, because it is unsafe to share residues between threads private ThreadLocal<LeftOver> leftovr = new ThreadLocal<>(); private ThreadLocal<DocState> docState = new ThreadLocal<>(); private ThreadLocal<DateUtil> dateParsers = new ThreadLocal<>(); public static final String BODY_FIELD = "body"; public static final String TITLE_FIELD = "doctitle"; public static final String DATE_FIELD = "docdate"; public static final String DATE_MSEC_FIELD = "docdatenum"; public static final String TIME_SEC_FIELD = "doctimesecnum"; public static final String ID_FIELD = "docid"; public static final String BYTES_FIELD = "bytes"; public static final String NAME_FIELD = "docname"; protected Config config; protected FieldType valType; protected FieldType bodyValType; protected ContentSource source; protected boolean reuseFields; protected boolean indexProperties; private final AtomicInteger numDocsCreated = new AtomicInteger(); public DocMaker() { } // create a doc // use only part of the body, modify it to keep the rest (or use all if size==0). // reset the docdata properties so they are not added more than once. private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { final DocState ds = getDocState(); final Document doc = reuseFields ? ds.doc : new Document(); doc.clear(); // Set ID_FIELD FieldType ft = new FieldType(valType); ft.setStored(true); Field idField = ds.getField(ID_FIELD, ft); int id; if (r != null) { id = r.nextInt(updateDocIDLimit); } else { id = docData.getID(); if (id == -1) { id = numDocsCreated.getAndIncrement(); } } idField.setStringValue(Integer.toString(id)); doc.add(idField); // Set NAME_FIELD String name = docData.getName(); if (name == null) name = ""; name = cnt < 0 ? name : name + "_" + cnt; Field nameField = ds.getField(NAME_FIELD, valType); nameField.setStringValue(name); doc.add(nameField); // Set DATE_FIELD DateUtil util = dateParsers.get(); if (util == null) { util = new DateUtil(); dateParsers.set(util); } Date date = null; String dateString = docData.getDate(); if (dateString != null) { util.pos.setIndex(0); date = util.parser.parse(dateString, util.pos); //System.out.println(dateString + " parsed to " + date); } else { dateString = ""; } Field dateStringField = ds.getField(DATE_FIELD, valType); dateStringField.setStringValue(dateString); doc.add(dateStringField); if (date == null) { // just set to right now date = new Date(); } Field dateField = ds.getNumericField(DATE_MSEC_FIELD, Long.class); dateField.setLongValue(date.getTime()); doc.add(dateField); util.cal.setTime(date); final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND); Field timeSecField = ds.getNumericField(TIME_SEC_FIELD, Integer.class); timeSecField.setIntValue(sec); doc.add(timeSecField); // Set TITLE_FIELD String title = docData.getTitle(); Field titleField = ds.getField(TITLE_FIELD, valType); titleField.setStringValue(title == null ? "" : title); doc.add(titleField); String body = docData.getBody(); if (body != null && body.length() > 0) { String bdy; if (size <= 0 || size >= body.length()) { bdy = body; // use all docData.setBody(""); // nothing left } else { // attempt not to break words - if whitespace found within next 20 chars... for (int n = size - 1; n < size + 20 && n < body.length(); n++) { if (Character.isWhitespace(body.charAt(n))) { size = n; break; } } bdy = body.substring(0, size); // use part docData.setBody(body.substring(size)); // some left } Field bodyField = ds.getField(BODY_FIELD, bodyValType); bodyField.setStringValue(bdy); doc.add(bodyField); if (storeBytes) { Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED); bytesField.setBytesValue(bdy.getBytes(StandardCharsets.UTF_8)); doc.add(bytesField); } } if (indexProperties) { Properties props = docData.getProps(); if (props != null) { for (final Map.Entry<Object,Object> entry : props.entrySet()) { Field f = ds.getField((String) entry.getKey(), valType); f.setStringValue((String) entry.getValue()); doc.add(f); } docData.setProps(null); } } //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); return doc; } private void resetLeftovers() { leftovr.set(null); } protected DocState getDocState() { DocState ds = docState.get(); if (ds == null) { ds = new DocState(reuseFields, valType, bodyValType); docState.set(ds); } return ds; } /** * Closes the {@link DocMaker}. The base implementation closes the * {@link ContentSource}, and it can be overridden to do more work (but make * sure to call super.close()). */ @Override public void close() throws IOException { source.close(); } /** * Creates a {@link Document} object ready for indexing. This method uses the * {@link ContentSource} to get the next document from the source, and creates * a {@link Document} object from the returned fields. If * <code>reuseFields</code> was set to true, it will reuse {@link Document} * and {@link Field} instances. */ public Document makeDocument() throws Exception { resetLeftovers(); DocData docData = source.getNextDocData(getDocState().docData); Document doc = createDocument(docData, 0, -1); return doc; } /** * Same as {@link #makeDocument()}, only this method creates a document of the * given size input by <code>size</code>. */ public Document makeDocument(int size) throws Exception { LeftOver lvr = leftovr.get(); if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null || lvr.docdata.getBody().length() == 0) { resetLeftovers(); } DocData docData = getDocState().docData; DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata); int cnt = (lvr == null ? 0 : lvr.cnt); while (dd.getBody() == null || dd.getBody().length() < size) { DocData dd2 = dd; dd = source.getNextDocData(new DocData()); cnt = 0; dd.setBody(dd2.getBody() + dd.getBody()); } Document doc = createDocument(dd, size, cnt); if (dd.getBody() == null || dd.getBody().length() == 0) { resetLeftovers(); } else { if (lvr == null) { lvr = new LeftOver(); leftovr.set(lvr); } lvr.docdata = dd; lvr.cnt = ++cnt; } return doc; } /** Reset inputs so that the test run would behave, input wise, as if it just started. */ public synchronized void resetInputs() throws IOException { source.printStatistics("docs"); // re-initiate since properties by round may have changed. setConfig(config, source); source.resetInputs(); numDocsCreated.set(0); resetLeftovers(); } /** Set the configuration parameters of this doc maker. */ public void setConfig(Config config, ContentSource source) { this.config = config; this.source = source; boolean stored = config.get("doc.stored", false); boolean bodyStored = config.get("doc.body.stored", stored); boolean tokenized = config.get("doc.tokenized", true); boolean bodyTokenized = config.get("doc.body.tokenized", tokenized); boolean norms = config.get("doc.tokenized.norms", false); boolean bodyNorms = config.get("doc.body.tokenized.norms", true); boolean bodyOffsets = config.get("doc.body.offsets", false); boolean termVec = config.get("doc.term.vector", false); boolean termVecPositions = config.get("doc.term.vector.positions", false); boolean termVecOffsets = config.get("doc.term.vector.offsets", false); valType = new FieldType(TextField.TYPE_NOT_STORED); valType.setStored(stored); valType.setTokenized(tokenized); valType.setOmitNorms(!norms); valType.setStoreTermVectors(termVec); valType.setStoreTermVectorPositions(termVecPositions); valType.setStoreTermVectorOffsets(termVecOffsets); valType.freeze(); bodyValType = new FieldType(TextField.TYPE_NOT_STORED); bodyValType.setStored(bodyStored); bodyValType.setTokenized(bodyTokenized); bodyValType.setOmitNorms(!bodyNorms); if (bodyTokenized && bodyOffsets) { bodyValType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } bodyValType.setStoreTermVectors(termVec); bodyValType.setStoreTermVectorPositions(termVecPositions); bodyValType.setStoreTermVectorOffsets(termVecOffsets); bodyValType.freeze(); storeBytes = config.get("doc.store.body.bytes", false); reuseFields = config.get("doc.reuse.fields", true); // In a multi-rounds run, it is important to reset DocState since settings // of fields may change between rounds, and this is the only way to reset // the cache of all threads. docState = new ThreadLocal<>(); indexProperties = config.get("doc.index.props", false); updateDocIDLimit = config.get("doc.random.id.limit", -1); if (updateDocIDLimit != -1) { r = new Random(179); } } }