/*******************************************************************************
* Copyright (c) 2012 György Orosz, Attila Novák.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Lesser Public License v3
* which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/
*
* This file is part of PurePos.
*
* PurePos is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* PurePos is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* Contributors:
* György Orosz - initial API and implementation
******************************************************************************/
package hu.ppke.itk.nlpg.corpusreader;
import hu.ppke.itk.nlpg.docmodel.IDocElement;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Scanner;
/**
* Abstract class for reading a corpus
*
* @author György Orosz
*
* @param <C>
* DocElement type which is read
*/
public abstract class AbstractDocElementReader<C extends IDocElement>
        implements ICorpusReader<C> {

    /**
     * String appended after every line while the input is collected.
     * Defaults to "\n" instead of the platform line separator, so the
     * collected text is the same on every platform.
     */
    protected String lineSeparator = "\n";

    /** Name of the character encoding used when opening files. */
    protected String fileEncoding = "UTF-8";

    /**
     * Not used by this class itself; kept for subclasses (presumably the
     * separator between the parts of an annotated token - confirm against
     * the concrete readers).
     */
    protected String separator;

    /**
     * Reads the whole content of the file and parses it with
     * {@code read(String)}.
     *
     * @param file
     *            file to read, decoded with the configured encoding
     * @return the parsed document element
     * @throws ParsingException
     *             if the file cannot be found (the
     *             {@link FileNotFoundException} is passed on as the cause)
     *             or if parsing the text fails
     */
    @Override
    public C readFromFile(File file) throws ParsingException {
        try {
            return read(readFile(file));
        } catch (FileNotFoundException e) {
            throw new ParsingException(e);
        }
    }

    /**
     * Reads the remaining content of the scanner and parses it with
     * {@code read(String)}. The scanner is not closed here: it belongs to
     * the caller.
     *
     * @param sc
     *            scanner supplying the text
     * @return the parsed document element
     * @throws ParsingException
     *             if parsing the text fails
     */
    @Override
    public C readFromScanner(Scanner sc) throws ParsingException {
        return read(readScanner(sc));
    }

    /**
     * Sets the encoding for reading a text file. The default is UTF-8.
     *
     * @param enc
     *            file encoding
     */
    public void setEncoding(String enc) {
        this.fileEncoding = enc;
    }

    /**
     * Sets the line separator string for reading. This string is appended
     * after each line during the reading. The default is "\n".
     *
     * @param sep
     *            line end marker string
     */
    public void setLineSeparator(String sep) {
        this.lineSeparator = sep;
    }

    /**
     * Reads the text content from a file, using the configured encoding.
     * The scanner opened on the file is always closed before returning, so
     * no file handle is leaked.
     *
     * @param file
     *            file to read
     * @return the content of the file, each line followed by the configured
     *         line separator
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     */
    protected String readFile(File file) throws FileNotFoundException {
        Scanner scanner = new Scanner(file, fileEncoding);
        try {
            return readScanner(scanner);
        } finally {
            // This scanner was created here, so it must be released here.
            scanner.close();
        }
    }

    /**
     * Collects the lines supplied by the scanner into one string, appending
     * the configured line separator after each line. Reading stops as soon
     * as the scanner has no further token, so trailing whitespace-only
     * lines are not part of the result. The scanner is left open.
     *
     * @param scanner
     *            source of the lines
     * @return the collected text
     */
    protected String readScanner(Scanner scanner) {
        StringBuilder text = new StringBuilder();
        while (scanner.hasNext()) {
            text.append(scanner.nextLine()).append(lineSeparator);
        }
        return text.toString();
    }
}