/* * Copyright (c) 2013 Websquared, Inc. * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Public License v2.0 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html * * Contributors: * swsong - initial API and implementation */ package org.fastcatsearch.datasource.reader; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.fastcatsearch.datasource.SourceModifier; import org.fastcatsearch.datasource.reader.annotation.SourceReader; import org.fastcatsearch.env.Environment; import org.fastcatsearch.ir.common.IRException; import org.fastcatsearch.ir.config.DataSourceConfig; import org.fastcatsearch.ir.config.SingleSourceConfig; import org.fastcatsearch.ir.index.PrimaryKeys; import org.fastcatsearch.ir.io.DirBufferedReader; import org.fastcatsearch.ir.settings.SchemaSetting; @SourceReader(name="DUMP_FILE") public class DumpFileSourceReader extends SingleSourceReader<Map<String, Object>> { protected DirBufferedReader br; protected Map<String, Object> dataMap; protected static String DOC_START = "<doc>"; protected static String DOC_END = "</doc>"; protected static String OPEN_PATTERN = "^<([\\w]+[^>]*)>$"; protected static String CLOSE_PATTERN = "^<\\/([\\w]+[^>]*)>$"; protected Pattern OPAT; protected Pattern CPAT; public DumpFileSourceReader() { super(); } public DumpFileSourceReader(String collectionId, File filePath, SingleSourceConfig singleSourceConfig, SourceModifier<Map<String, Object>> sourceModifier, String lastIndexTime) throws IRException { super(collectionId, filePath, singleSourceConfig, sourceModifier, lastIndexTime); } @Override public void init() throws IRException { String fileEncoding = getConfigString("encoding"); if (fileEncoding == null) { fileEncoding = Charset.defaultCharset().toString(); } try { File file = filePath.makePath(getConfigString("filepath")).file(); br = new DirBufferedReader(file, fileEncoding); logger.info("Collect file = {}, {}", file.getAbsolutePath(), fileEncoding); } catch (UnsupportedEncodingException e) { logger.error(e.getMessage(), e); throw new IRException(e); } catch (FileNotFoundException e) { logger.error(e.getMessage(), e); throw new IRException(e); } catch (IOException e) { logger.error(e.getMessage(), e); throw new IRException(e); } dataMap = null; OPAT = Pattern.compile(OPEN_PATTERN); CPAT = Pattern.compile(CLOSE_PATTERN); } @Override public boolean hasNext() throws IRException { String line = null; dataMap = new HashMap<String, Object>(); String oneDoc = readOneDoc(); if (oneDoc == null) { return false; } BufferedReader reader = new BufferedReader(new StringReader(oneDoc)); StringBuffer sb = new StringBuffer(); String openTag = ""; boolean isOpened = false; while (true) { try { line = reader.readLine(); if (line == null) { break; } if (line.length() == 0) { continue; } line = line.trim(); if (line.length() > 1 && line.charAt(0) == '<' && line.charAt(1) != '/') { Matcher m = OPAT.matcher(line); if (m.matches()) { if(!isOpened){ String tag = m.group(1); openTag = tag; isOpened = true; // if (logger.isTraceEnabled()) { // logger.trace("OpenTag [{}]", tag); // } continue; } } } if (isOpened && line.startsWith("</")) { Matcher m = CPAT.matcher(line); if (m.matches()) { String closeTag = m.group(1); if (openTag.equals(closeTag)) { isOpened = false; String targetStr = sb.toString(); // if (logger.isTraceEnabled()) { // logger.trace("CloseTag [{}]", closeTag); // logger.trace("Data [{}]", targetStr); // } dataMap.put(openTag.toUpperCase(), targetStr); sb = new StringBuffer(); continue; } } } if (sb.length() > 0) { sb.append(Environment.LINE_SEPARATOR); } sb.append(line); // logger.debug(sb.toString()); } catch (IOException e) { logger.error(e.getMessage(), e); throw new IRException(e); } } // logger.debug("doc = "+document); if (dataMap == null) return false; return true; } private String checkDeleteDocs(String line) throws IOException { while ("<delete_doc>".equals(line)) { line = nextLine(); if (line == null) { return null; } int keySize = deleteIdList.keySize(); PrimaryKeys pk = new PrimaryKeys(keySize); int i = 0; while(!line.equals("</delete_doc>")) { pk.set(i++, line); line = nextLine(); } logger.debug("Delete request>> {}", pk); deleteIdList.add(pk); line = nextLine(); } return line; } protected String nextLine() throws IOException { String line = br.readLine(); if (line == null) { return null; } line = line.trim(); while (line.length() == 0) { line = nextLine(); if (line == null) { return null; } line = line.trim(); } return line; } private String readOneDoc() throws IRException { try { StringBuffer sb = new StringBuffer(); String line = nextLine(); if (line == null) { return null; } line = checkDeleteDocs(line); if (line == null) { return null; } int lineNumber = 0; while (!line.equals(DOC_START)) { // doc opened line = nextLine(); if (line == null) { return null; } } line = nextLine(); // doc started while (!line.equals(DOC_END)) { // doc ended if (lineNumber >= 1) { sb.append(Environment.LINE_SEPARATOR); } sb.append(line); line = nextLine(); lineNumber++; } return sb.toString(); } catch (IOException e) { throw new IRException(e); } } @Override public Map<String, Object> next() throws IRException { return dataMap; } @Override public void close() throws IRException { try { if (br != null) { br.close(); } } catch (IOException e) { throw new IRException(e); } } @Override protected void initParameters() { registerParameter(new SourceReaderParameter("filepath", "File Path", "Filepath for indexing." , SourceReaderParameter.TYPE_STRING_LONG, true, null)); registerParameter(new SourceReaderParameter("encoding", "Encoding", "File encoding" , SourceReaderParameter.TYPE_STRING, true, null)); } }