/* * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ------------------- * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit * http://www.manning.com/ingersoll */ package com.tamingtext.qa; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map; import java.util.Properties; import org.apache.lucene.benchmark.byTask.feeds.ContentSource; import org.apache.lucene.benchmark.byTask.feeds.DocData; import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException; import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.benchmark.byTask.utils.StreamUtils; public class WexWikiContentSource extends ContentSource { private static final Map<String,Integer> ELEMENTS = new HashMap<String,Integer>(); private static final int TITLE = 0; private static final int DATE = TITLE + 1; private static final int BODY = DATE + 1; private static final int ID = BODY + 1; private static final int CATEGORY = ID + 1; private static final int LENGTH = CATEGORY + 1; // LENGTH is used as the size of the tuple, so whatever constants we need that // should not be part of the tuple, we should define them after LENGTH. private static final int PAGE = LENGTH + 1; private BufferedReader ir; WexWikiContentSource() { } private class Parser { String[] tuple = new String[LENGTH]; public String[] next() throws IOException { String[] parts; do { String line = ir.readLine(); if (line == null) return null; parts = line.split("\\t"); } while (parts.length != 5); tuple[ID] = parts[0]; tuple[TITLE] = parts[1]; tuple[DATE] = parts[2]; tuple[BODY] = parts[4]; tuple[CATEGORY] = parseCategory(parts[3]); return tuple; } final String CATEGORY_PREFIX = "<target>Category:"; final int CATEGORY_PREFIX_LEN = CATEGORY_PREFIX.length()-1; final String CATEGORY_SUFFIX = "</target>"; final int CATEGORY_SUFFIX_LEN = CATEGORY_SUFFIX.length()-1; final StringBuilder b = new StringBuilder(); public String parseCategory(String input) { b.setLength(0); int start = 0; int end = 0; while (true) { start = input.indexOf("target>Category:", end); if (start < 0) break; start += CATEGORY_PREFIX_LEN; end = input.indexOf("</target>", start); if (end < start) break; b.append(input.substring(start, end)).append(";;"); end += CATEGORY_SUFFIX_LEN; } return b.toString(); } } private File file; Parser parser = new Parser(); public void resetInputs() throws IOException { super.resetInputs(); ir = getReader(file); } public BufferedReader getReader(File file) throws IOException { InputStream is = StreamUtils.inputStream(file); return new BufferedReader(new InputStreamReader(is, "UTF-8")); } @Override public void close() throws IOException { if (ir != null) { ir.close(); ir = null; } } Properties props = new Properties(); @Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { if (ir == null) { ir = getReader(file); } String[] tuple = parser.next(); if (tuple == null) return null; docData.clear(); docData.setID(Integer.parseInt(tuple[ID])); docData.setTitle(tuple[TITLE]); docData.setBody(tuple[BODY]); docData.setDate(tuple[DATE]); props.setProperty("category", tuple[CATEGORY]); docData.setProps(props); return docData; } @Override public void setConfig(Config config) { super.setConfig(config); String fileName = config.get("docs.file", null); if (fileName == null) { throw new IllegalArgumentException("docs.file must be set"); } file = new File(fileName).getAbsoluteFile(); } }