package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.InputStream;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/** Minimal port of contrib/benchmark's LineDocSource +
* DocMaker, so tests can enumerate docs from a line file created
* by contrib/benchmark's WriteLineDoc task */
public class LineFileDocs implements Closeable {

  /** Size of the char buffer used by the underlying {@link BufferedReader}. */
  private final static int BUFFER_SIZE = 1 << 16; // 64K

  /** Separates the title, date and body columns of each line. */
  private final static char SEP = '\t';

  /** Current reader over the line file; null while closed. Guarded by {@code this}. */
  private BufferedReader reader;

  /** Source of the "docid" field values; only {@link #reset} sets it back to 0. */
  private final AtomicInteger id = new AtomicInteger();

  /** Classpath resource name or filesystem path of the line file. */
  private final String path;

  /** One reusable document (and its fields) per calling thread. */
  private final ThreadLocal<DocState> threadDocs = new ThreadLocal<DocState>();

  /**
   * Opens the line file at {@code path}. The path is first looked up as a
   * classpath resource and, failing that, opened as a filesystem path. Paths
   * ending in ".gz" are decompressed on the fly.
   *
   * @param random if non-null, used to pick a random starting offset within
   *        roughly the first third of the (estimated) file size; if null,
   *        reading starts at the beginning of the file
   * @param path classpath resource name or filesystem path of the line file
   * @throws IOException if the file cannot be opened or read
   */
  public LineFileDocs(Random random, String path) throws IOException {
    this.path = path;
    open(random);
  }

  /**
   * Opens the default line file, {@code LuceneTestCase.TEST_LINE_DOCS_FILE}.
   *
   * @param random see {@link #LineFileDocs(Random, String)}
   * @throws IOException if the file cannot be opened or read
   */
  public LineFileDocs(Random random) throws IOException {
    this(random, LuceneTestCase.TEST_LINE_DOCS_FILE);
  }

  /** Closes the underlying reader, if one is open; calling it again is a no-op. */
  public synchronized void close() throws IOException {
    if (reader != null) {
      reader.close();
      reader = null;
    }
  }

  /**
   * Opens {@link #path} and assigns {@link #reader}. When {@code random} is
   * non-null and the estimated size is large enough, skips to a random
   * position and discards the (probably partial) line found there.
   *
   * <p>If anything fails after the raw stream was obtained, the stream is
   * closed and {@link #reader} is left null, so no file handle is leaked.
   */
  private synchronized void open(Random random) throws IOException {
    InputStream is = getClass().getResourceAsStream(path);
    if (is == null) {
      // if its not in classpath, we load it as absolute filesystem path (e.g. Hudson's home dir)
      is = new FileInputStream(path);
    }

    boolean success = false;
    try {
      final File file = new File(path);
      long size;
      if (file.exists()) {
        size = file.length();
      } else {
        size = is.available();
      }

      if (path.endsWith(".gz")) {
        // If this constructor throws, "is" still refers to the raw stream,
        // which the finally clause below closes.
        is = new GZIPInputStream(is);
        // guestimate of the uncompressed size:
        size *= 2.8;
      }

      reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), BUFFER_SIZE);

      // Override sizes for currently "known" line files:
      if (path.equals("europarl.lines.txt.gz")) {
        size = 15129506L;
      } else if (path.equals("/home/hudson/lucene-data/enwiki.random.lines.txt.gz")) {
        size = 3038178822L;
      }

      // Randomly seek to starting point:
      if (random != null && size > 3) {
        final long seekTo = (random.nextLong()&Long.MAX_VALUE) % (size/3);
        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: LineFileDocs: seek to fp=" + seekTo + " on open");
        }
        reader.skip(seekTo);
        // We most likely landed mid-line; drop the remainder of that line.
        reader.readLine();
      }
      success = true;
    } finally {
      if (!success) {
        reader = null;
        try {
          is.close();
        } catch (IOException ignored) {
          // Deliberately ignored: the exception already propagating from the
          // try block is the one the caller needs to see.
        }
      }
    }
  }

  /**
   * Closes and reopens the line file and restarts doc ids at 0.
   *
   * @param random if non-null, used to pick a random starting offset, as in
   *        the constructor
   * @throws IOException if the file cannot be closed or reopened
   */
  public synchronized void reset(Random random) throws IOException {
    close();
    open(random);
    id.set(0);
  }

  /** Holds one reusable {@link Document} together with the fields added to it. */
  private static final class DocState {
    final Document doc;
    /** Title, stored and analyzed, with term vectors (positions + offsets). */
    final Field titleTokenized;
    /** Title as a single un-analyzed term; not stored. */
    final Field title;
    /** Body text, stored and analyzed, with term vectors (positions + offsets). */
    final Field body;
    /** Sequential doc id as a string, stored and un-analyzed. */
    final Field id;
    /** Date column of the line, stored and un-analyzed. */
    final Field date;

    public DocState() {
      doc = new Document();

      title = new Field("title", "", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
      doc.add(title);

      titleTokenized = new Field("titleTokenized", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
      doc.add(titleTokenized);

      body = new Field("body", "", Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
      doc.add(body);

      id = new Field("docid", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
      doc.add(id);

      date = new Field("date", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
      doc.add(date);
    }
  }

  /** Returns the calling thread's {@link DocState}, creating it on first use. */
  private DocState currentDocState() {
    DocState docState = threadDocs.get();
    if (docState == null) {
      docState = new DocState();
      threadDocs.set(docState);
    }
    return docState;
  }

  /**
   * Reads the next line and returns it as a document. Each line must have the
   * form {@code title<TAB>date<TAB>body}. At end of file the file is reopened
   * from the beginning (doc ids keep counting up).
   *
   * <p>Note: Document instance is re-used per-thread
   *
   * @return the calling thread's document, with its fields set from the line
   * @throws IOException if reading fails, or if the file contains no lines
   * @throws RuntimeException if the line does not contain two tab separators
   */
  public Document nextDoc() throws IOException {
    String line;
    synchronized(this) {
      line = reader.readLine();
      if (line == null) {
        // Always rewind at end:
        if (LuceneTestCase.VERBOSE) {
          System.out.println("TEST: LineFileDocs: now rewind file...");
        }
        close();
        open(null);
        line = reader.readLine();
        if (line == null) {
          // Still nothing right after a rewind: the file is empty. Fail with
          // a clear message rather than a NullPointerException further down.
          throw new IOException("line file \"" + path + "\" contains no lines");
        }
      }
    }

    final DocState docState = currentDocState();

    // Locate the two separators: title | date | body.
    final int titleEnd = line.indexOf(SEP);
    final int dateEnd = titleEnd == -1 ? -1 : line.indexOf(SEP, 1 + titleEnd);
    if (dateEnd == -1) {
      throw new RuntimeException("line: [" + line + "] is in an invalid format !");
    }

    final String title = line.substring(0, titleEnd);
    docState.body.setValue(line.substring(1 + dateEnd, line.length()));
    docState.title.setValue(title);
    docState.titleTokenized.setValue(title);
    docState.date.setValue(line.substring(1 + titleEnd, dateEnd));
    docState.id.setValue(Integer.toString(id.getAndIncrement()));
    return docState.doc;
  }
}