/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.DateFormat;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Locale;

import org.apache.lucene.benchmark.byTask.feeds.TrecDocParser.ParsePathType;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;

/**
 * Implements a {@link ContentSource} over the TREC collection.
 * <p>
 * Supports the following configuration parameters (on top of
 * {@link ContentSource}):
 * <ul>
 * <li><b>work.dir</b> - specifies the working directory. Required if "docs.dir"
 * denotes a relative path (<b>default=work</b>).
 * <li><b>docs.dir</b> - specifies the directory where the TREC files reside.
 * Can be set to a relative path if "work.dir" is also specified
 * (<b>default=trec</b>).
 * <li><b>trec.doc.parser</b> - specifies the {@link TrecDocParser} class to use for
 * parsing the TREC documents content (<b>default=TrecGov2Parser</b>).
 * <li><b>html.parser</b> - specifies the {@link HTMLParser} class to use for
 * parsing the HTML parts of the TREC documents content (<b>default=DemoHTMLParser</b>).
 * <li><b>content.source.encoding</b> - if not specified, ISO-8859-1 is used.
 * <li><b>content.source.excludeIteration</b> - if true, do not append iteration number to docname
 * </ul>
 */
public class TrecContentSource extends ContentSource {

  /**
   * Holder for the date parsers built from {@link #DATE_FORMATS} together with
   * the {@link ParsePosition} reused across parse attempts. One instance is kept
   * per thread (see {@link #getDateFormatInfo()}).
   */
  static final class DateFormatInfo {
    DateFormat[] dfs;
    ParsePosition pos;
  }

  public static final String DOCNO = "<DOCNO>";
  public static final String TERMINATING_DOCNO = "</DOCNO>";
  public static final String DOC = "<DOC>";
  public static final String TERMINATING_DOC = "</DOC>";

  /** separator between lines in the buffer */
  public static final String NEW_LINE = System.getProperty("line.separator");

  /** Patterns tried, in this order, by {@link #parseDate(String)}. */
  private static final String[] DATE_FORMATS = {
       "EEE, dd MMM yyyy kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
       "EEE MMM dd kk:mm:ss yyyy z",    // Tue Dec 09 16:45:08 2003 EST
       "EEE, dd-MMM-':'y kk:mm:ss z",   // Tue, 09 Dec 2003 22:39:08 GMT
       "EEE, dd-MMM-yyy kk:mm:ss z",    // Tue, 09 Dec 2003 22:39:08 GMT
       "EEE MMM dd kk:mm:ss yyyy",      // Tue Dec 09 16:45:08 2003
       "dd MMM yyyy",                   // 1 March 1994
       "MMM dd, yyyy",                  // February 3, 1994
       "yyMMdd",                        // 910513
       "hhmm z.z.z. MMM dd, yyyy",      // 0901 u.t.c. April 28, 1994
  };

  private final ThreadLocal<DateFormatInfo> dateFormats = new ThreadLocal<>();
  private final ThreadLocal<StringBuilder> trecDocBuffer = new ThreadLocal<>();
  private Path dataDir = null;
  // Kept as ArrayList because it is handed to the inherited collectFiles(...).
  private final ArrayList<Path> inputFiles = new ArrayList<>();
  private int nextFile = 0;
  // Use to synchronize threads on reading from the TREC documents.
  private final Object lock = new Object();

  // Required for test
  BufferedReader reader;
  int iteration = 0;
  HTMLParser htmlParser;

  private boolean excludeDocnameIteration;
  private TrecDocParser trecDocParser = new TrecGov2Parser(); // default
  ParsePathType currPathType; // not private for tests

  /**
   * Returns the calling thread's {@link DateFormatInfo}, creating it on first use.
   * Each entry of {@link #DATE_FORMATS} becomes a lenient, English-locale
   * {@link SimpleDateFormat}.
   */
  private DateFormatInfo getDateFormatInfo() {
    DateFormatInfo dfi = dateFormats.get();
    if (dfi == null) {
      dfi = new DateFormatInfo();
      dfi.dfs = new SimpleDateFormat[DATE_FORMATS.length];
      for (int i = 0; i < dfi.dfs.length; i++) {
        dfi.dfs[i] = new SimpleDateFormat(DATE_FORMATS[i], Locale.ENGLISH);
        dfi.dfs[i].setLenient(true);
      }
      dfi.pos = new ParsePosition(0);
      dateFormats.set(dfi);
    }
    return dfi;
  }

  /** Returns the calling thread's reusable document buffer, creating it on first use. */
  private StringBuilder getDocBuffer() {
    StringBuilder sb = trecDocBuffer.get();
    if (sb == null) {
      sb = new StringBuilder();
      trecDocBuffer.set(sb);
    }
    return sb;
  }

  /** Returns the HTML parser configured in {@link #setConfig(Config)}, or null before that. */
  HTMLParser getHtmlParser() {
    return htmlParser;
  }

  /**
   * Read until a line starting with the specified <code>lineStart</code>.
   * When the current file runs out, the next one is opened and reading continues.
   * @param buf buffer for collecting the data if so specified.
   * @param lineStart line start to look for, must not be null.
   * @param collectMatchLine whether to collect the matching line into <code>buffer</code>.
   * @param collectAll whether to collect all lines into <code>buffer</code>.
   * @throws IOException If there is a low-level I/O error.
   * @throws NoMoreDataException If the source is exhausted.
   */
  private void read(StringBuilder buf, String lineStart,
      boolean collectMatchLine, boolean collectAll) throws IOException, NoMoreDataException {
    String sep = "";
    while (true) {
      String line = reader.readLine();

      if (line == null) {
        // current file is exhausted; openNextFile() throws when nothing is left
        openNextFile();
        continue;
      }

      if (lineStart != null && line.startsWith(lineStart)) {
        if (collectMatchLine) {
          buf.append(sep).append(line);
          sep = NEW_LINE;
        }
        return;
      }

      if (collectAll) {
        buf.append(sep).append(line);
        sep = NEW_LINE;
      }
    }
  }

  /**
   * Closes the current reader and opens the next input file.
   * <p>
   * When the file list is exhausted and <code>forever</code> is set, the list is
   * restarted and {@link #iteration} is incremented; otherwise
   * {@link NoMoreDataException} is thrown. A file that cannot be opened is
   * reported and skipped in verbose mode; in non-verbose mode it ends the read
   * with a {@link NoMoreDataException} carrying the original failure as its cause.
   * <p>
   * {@link #reader} and {@link #currPathType} are only updated once the file has
   * been opened completely, and the raw stream is closed on any failure, so a
   * failed attempt leaves no open handle behind.
   *
   * @throws NoMoreDataException if no further file is available or a file could
   *         not be opened while not in verbose mode.
   * @throws IOException if the file size cannot be read for the verbose message.
   */
  void openNextFile() throws NoMoreDataException, IOException {
    close();
    currPathType = null;
    // NOTE(review): with "forever" and "verbose" both set, a file list in which
    // every file fails to open keeps this loop spinning - confirm that is acceptable.
    while (true) {
      if (nextFile >= inputFiles.size()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        nextFile = 0;
        iteration++;
      }
      Path f = inputFiles.get(nextFile++);
      if (verbose) {
        System.out.println("opening: " + f + " length: " + Files.size(f));
      }
      InputStream inputStream = null;
      try {
        // support either gzip, bzip2, or regular text file, by extension
        inputStream = StreamUtils.inputStream(f);
        BufferedReader opened =
            new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE);
        ParsePathType pathType = TrecDocParser.pathType(f);
        // publish only after every step succeeded
        reader = opened;
        currPathType = pathType;
        return;
      } catch (Exception e) {
        if (inputStream != null) {
          try {
            inputStream.close();
          } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
          }
        }
        if (verbose) {
          System.out.println("Skipping 'bad' file " + f.toAbsolutePath()+" due to "+e.getMessage());
          continue;
        }
        NoMoreDataException failure = new NoMoreDataException();
        failure.initCause(e); // keep the real reason visible to whoever handles this
        throw failure;
      }
    }
  }

  /**
   * Tries each of the known date patterns in turn on the trimmed input.
   *
   * @param dateStr the text to parse; may be null.
   * @return the first successfully parsed date, or null if <code>dateStr</code> is
   *         null or no pattern matches. (The verbose message speaks of assigning
   *         'now', but this method itself only returns null.)
   */
  public Date parseDate(String dateStr) {
    if (dateStr == null) {
      return null;
    }
    dateStr = dateStr.trim();
    DateFormatInfo dfi = getDateFormatInfo();
    for (int i = 0; i < dfi.dfs.length; i++) {
      DateFormat df = dfi.dfs[i];
      // the ParsePosition is shared between attempts, so reset it each time
      dfi.pos.setIndex(0);
      dfi.pos.setErrorIndex(-1);
      Date d = df.parse(dateStr, dfi.pos);
      if (d != null) {
        // Parse succeeded.
        return d;
      }
    }
    // do not fail test just because a date could not be parsed
    if (verbose) {
      System.out.println("failed to parse date (assigning 'now') for: " + dateStr);
    }
    return null;
  }

  /**
   * Closes the current reader, if any, and clears the reference. A failure to
   * close is deliberately not propagated; it is only reported in verbose mode.
   */
  @Override
  public void close() throws IOException {
    if (reader == null) {
      return;
    }

    try {
      reader.close();
    } catch (IOException e) {
      if (verbose) {
        System.out.println("failed to close reader !");
        e.printStackTrace(System.out);
      }
    }
    reader = null;
  }

  /**
   * Reads the next <code>&lt;DOC&gt;</code> ... <code>&lt;/DOC&gt;</code> record and hands it
   * to the configured {@link TrecDocParser}.
   * <p>
   * The document name is taken from the <code>&lt;DOCNO&gt;</code> line; unless
   * "content.source.excludeIteration" is set, <code>_&lt;iteration&gt;</code> is appended.
   *
   * @param docData the instance passed on to the parser.
   * @return whatever the parser returns for this document.
   * @throws NoMoreDataException if the source is exhausted.
   * @throws IOException if there is a low-level I/O error.
   */
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    String name = null;
    StringBuilder docBuf = getDocBuffer();
    ParsePathType parsedPathType;

    // protect reading from the TREC files by multiple threads. The rest of the
    // method, i.e., parsing the content and returning the DocData can run unprotected.
    synchronized (lock) {
      if (reader == null) {
        openNextFile();
      }

      // 1. skip until doc start - required for all TREC formats
      docBuf.setLength(0);
      read(docBuf, DOC, false, false);

      // save parsedFile for passing trecDataParser after the sync block, in
      // case another thread will open another file in between.
      parsedPathType = currPathType;

      // 2. name - required for all TREC formats
      docBuf.setLength(0);
      read(docBuf, DOCNO, true, false);
      name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
          DOCNO.length())).trim();

      if (!excludeDocnameIteration) {
        name = name + "_" + iteration;
      }

      // 3. read all until end of doc
      docBuf.setLength(0);
      read(docBuf, TERMINATING_DOC, false, true);
    }

    // count char length of text to be parsed (may be larger than the resulted plain doc body text).
    addBytes(docBuf.length());

    // This code segment relies on HtmlParser being thread safe. When we get
    // here, everything else is already private to that thread, so we're safe.
    docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
    addItem();

    return docData;
  }

  /** Resets the inherited state, closes the current reader and restarts from the first file. */
  @Override
  public void resetInputs() throws IOException {
    synchronized (lock) {
      super.resetInputs();
      close();
      nextFile = 0;
      iteration = 0;
    }
  }

  /**
   * Applies the configuration: resolves the data directory, collects the input
   * files, instantiates the TREC document parser and the HTML parser, and sets
   * the default encoding and the docname-iteration flag.
   *
   * @throws IllegalArgumentException if no input files were collected.
   * @throws RuntimeException if collecting files fails or a parser class cannot
   *         be loaded or instantiated (the original exception is the cause).
   */
  @Override
  public void setConfig(Config config) {
    super.setConfig(config);
    // dirs
    Path workDir = Paths.get(config.get("work.dir", "work"));
    String d = config.get("docs.dir", "trec");
    dataDir = Paths.get(d);
    if (!dataDir.isAbsolute()) {
      dataDir = workDir.resolve(d);
    }
    // files
    try {
      collectFiles(dataDir, inputFiles);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    if (inputFiles.size() == 0) {
      throw new IllegalArgumentException("No files in dataDir: " + dataDir);
    }
    // trec doc parser
    try {
      String trecDocParserClassName = config.get("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
      // Class.newInstance() is deprecated; go through the no-arg constructor instead
      trecDocParser = Class.forName(trecDocParserClassName).asSubclass(TrecDocParser.class)
          .getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }
    // html parser
    try {
      String htmlParserClassName = config.get("html.parser",
          "org.apache.lucene.benchmark.byTask.feeds.DemoHTMLParser");
      htmlParser = Class.forName(htmlParserClassName).asSubclass(HTMLParser.class)
          .getDeclaredConstructor().newInstance();
    } catch (Exception e) {
      // Should not get here. Throw runtime exception.
      throw new RuntimeException(e);
    }
    // encoding
    if (encoding == null) {
      encoding = StandardCharsets.ISO_8859_1.name();
    }
    // iteration exclusion in doc name
    excludeDocnameIteration = config.get("content.source.excludeIteration", false);
  }

}