/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.quality.trec;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;

import org.apache.lucene.benchmark.quality.QualityQuery;

/**
 * Read TREC topics.
 * <p>
 * Expects this topic format -
 * <pre>
 *   &lt;top&gt;
 *   &lt;num&gt; Number: nnn
 *
 *   &lt;title&gt; title of the topic
 *
 *   &lt;desc&gt; Description:
 *   description of the topic
 *
 *   &lt;narr&gt; Narrative:
 *   "story" composed by assessors.
 *
 *   &lt;/top&gt;
 * </pre>
 * Comment lines starting with '#' are ignored.
 */
public class TrecTopicsReader {

  /** Prefix that marks a comment line in a topics file. */
  private static final String COMMENT_PREFIX = "#";

  /**
   * Constructor for Trec's TopicsReader
   */
  public TrecTopicsReader() {
    super();
  }

  /**
   * Read quality queries from trec format topics file.
   * <p>
   * Each topic yields one {@link QualityQuery} whose ID is the text after the
   * first ':' on the <code>&lt;num&gt;</code> line and whose fields are
   * <code>"title"</code>, <code>"description"</code> and <code>"narrative"</code>.
   * Multi-line descriptions and narratives are joined with single spaces.
   * The reader is always closed before this method returns, whether or not
   * reading succeeded.
   * <p>
   * A topic that ends before its <code>&lt;desc&gt;</code>,
   * <code>&lt;narr&gt;</code> or <code>&lt;/top&gt;</code> line is tolerated:
   * the affected text is whatever could be collected before end of input.
   *
   * @param reader where queries are read from.
   * @return the result quality queries, sorted with {@link Arrays#sort(Object[])}
   *         (i.e. by the natural ordering of {@link QualityQuery}, intended to be by ID).
   * @throws IOException if cannot read the queries, or if a topic is started
   *         but its <code>&lt;num&gt;</code> or <code>&lt;title&gt;</code> line is missing.
   */
  public QualityQuery[] readQueries(BufferedReader reader) throws IOException {
    ArrayList<QualityQuery> topics = new ArrayList<QualityQuery>();
    try {
      while (seekLine(reader, "<top>") != null) {
        // id: everything after the first ':' (the whole line if there is none)
        String numLine = requireLine(reader, "<num>");
        String id = numLine.substring(numLine.indexOf(':') + 1).trim();

        // title: everything after the closing '>' of the tag
        String titleLine = requireLine(reader, "<title>");
        String title = titleLine.substring(titleLine.indexOf('>') + 1).trim();

        // description: the "<desc>" header line itself is discarded, the
        // lines that follow it are collected up to the "<narr>" header
        seekLine(reader, "<desc>");
        String description = collectUntil(reader, "<narr>");

        // narrative: lines following the "<narr>" header, up to "</top>"
        String narrative = collectUntil(reader, "</top>");

        // we got a topic!
        HashMap<String,String> fields = new HashMap<String,String>();
        fields.put("title", title);
        fields.put("description", description);
        fields.put("narrative", narrative);
        topics.add(new QualityQuery(id, fields));
      }
    } finally {
      reader.close();
    }
    // sort result array (by ID)
    QualityQuery[] result = topics.toArray(new QualityQuery[0]);
    Arrays.sort(result);
    return result;
  }

  /**
   * Skip lines until one starts with the given prefix.
   *
   * @return the matching line, or <code>null</code> if end of input was
   *         reached first.
   */
  private static String seekLine(BufferedReader reader, String prefix) throws IOException {
    String line;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(prefix)) {
        return line;
      }
    }
    return null;
  }

  /**
   * Same as {@link #seekLine(BufferedReader, String)}, but a missing line is
   * reported as an {@link IOException} instead of a <code>null</code> result,
   * so a truncated topic fails with a clear message rather than a
   * NullPointerException.
   */
  private static String requireLine(BufferedReader reader, String prefix) throws IOException {
    String line = seekLine(reader, prefix);
    if (line == null) {
      throw new IOException("Malformed TREC topics input: reached end of input while looking for a line starting with '"
          + prefix + "'");
    }
    return line;
  }

  /**
   * Collect lines, joined by single spaces, until a line starts with
   * <code>endPrefix</code> (that line is consumed but not collected) or end
   * of input is reached. Comment lines (starting with '#') are skipped.
   *
   * @return the collected text, trimmed.
   */
  private static String collectUntil(BufferedReader reader, String endPrefix) throws IOException {
    StringBuilder text = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(endPrefix)) {
        break;
      }
      if (line.startsWith(COMMENT_PREFIX)) {
        continue; // documented contract: comment lines are ignored
      }
      if (text.length() > 0) {
        text.append(' ');
      }
      text.append(line);
    }
    return text.toString().trim();
  }
}