package hu.u_szeged.kpe.readers;
import java.io.File;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.pipeline.SzTECoreNLP;
/**
 * Base class of the document readers. A reader knows which files of a directory it is responsible for
 * (by file name suffix), how to turn such a file into {@link DocumentData} objects, and it owns the
 * configuration of the shared {@link SzTECoreNLP} pipeline.
 */
public abstract class KpeReader {

  /** Pipeline shared by all readers; created on the first call of {@link #initGrammar}. */
  public static SzTECoreNLP sentenceAnalyzer;

  /**
   * On default (unless overridden by a descendant class) we shall only treat files having this kind of
   * extension. It is the fallback used by {@link #addDirectoryOfFiles} whenever {@link #fileType} is unset.
   */
  protected static final String DEFAULT_EXTENSION = ".txt";

  /** File name suffix of the files this reader processes; {@code null} means "not configured". */
  protected String fileType;
  /** Character encoding associated with this reader. */
  protected Charset m_encoding;
  private boolean isMweOn;
  private boolean isNeOn;
  private boolean isSyntaxOn;
  /** Flag set through {@link #setUseGoldAnnotation(boolean)}; it is only stored by this class. */
  protected boolean goldAnnotation;

  /**
   * This method returns a {@link List} of {@link DocumentData} representations of the documents being
   * present at location dir+"/"+file. <br>
   * It returns a list as one file might contain more than just one document (e.g. in the form of an
   * XML). <br>
   * Most often however, this List is going to be just of size 1. <br>
   * Note that {@link #addDirectoryOfFiles} passes the <em>absolute path</em> of the file as
   * {@code file}.
   *
   * @param dir
   *          directory containing the file
   * @param file
   *          the file to read
   * @return the documents found in the file
   */
  public abstract List<DocumentData> getContent(String dir, String file);

  /**
   * @param file
   *          the file to read
   * @param numberWithinFile
   *          selects one document of the file (presumably its index within the file)
   * @return the text of the selected document
   */
  public abstract String getText(String file, int numberWithinFile);

  /**
   * @param line
   *          a line of a document
   * @return whether the line may be a section header
   */
  protected abstract boolean mightBeSectionHeader(String line);

  /**
   * Hook for reader specific configuration. It is invoked by the no-argument constructor only, hence it
   * runs before the field initializers and the constructor body of the descendant class.
   */
  protected abstract void setDetails();

  /** Creates a reader using the platform default charset and lets {@link #setDetails()} configure it. */
  public KpeReader() {
    m_encoding = Charset.defaultCharset();
    setDetails();
  }

  /**
   * Creates a reader using the platform default charset. {@link #setDetails()} is not invoked.
   *
   * @param extension
   *          file name suffix of the files to process
   */
  public KpeReader(String extension) {
    this(Charset.defaultCharset(), extension);
  }

  /**
   * Creates a reader with explicit settings. {@link #setDetails()} is not invoked.
   *
   * @param encoding
   *          character encoding associated with the reader
   * @param extension
   *          file name suffix of the files to process
   */
  public KpeReader(Charset encoding, String extension) {
    m_encoding = encoding;
    fileType = extension;
  }

  /** @return the multiword expression flag recorded by the last {@link #initGrammar} call */
  public boolean getIsMweOn() {
    return isMweOn;
  }

  /** @return the named entity flag recorded by the last {@link #initGrammar} call */
  public boolean getIsNeOn() {
    return isNeOn;
  }

  /** @return the syntax flag recorded by the last {@link #initGrammar} call */
  public boolean getIsSyntaxOn() {
    return isSyntaxOn;
  }

  /**
   * Records the requested feature flags and, if {@link #sentenceAnalyzer} does not exist yet, creates
   * it. An already existing analyzer is left untouched, i.e. the first call determines the annotators
   * of the shared pipeline.
   *
   * @param isMweFeatureOn
   *          whether multiword expression features are requested
   * @param isNeFeatureOn
   *          whether named entity features are requested
   * @param isSyntacticFeatureOn
   *          whether syntactic features are requested
   * @param lang
   *          language code; the optional annotators are only added for {@code "en"}
   * @throws NullPointerException
   *           if the analyzer has to be created and {@code lang} is {@code null}
   */
  public void initGrammar(boolean isMweFeatureOn, boolean isNeFeatureOn, boolean isSyntacticFeatureOn, String lang) {
    isMweOn = isMweFeatureOn;
    isNeOn = isNeFeatureOn;
    isSyntaxOn = isSyntacticFeatureOn;
    if (sentenceAnalyzer != null) {
      return;
    }
    if (lang == null) {
      // Properties does not accept null values, so fail here with a message that names the culprit
      throw new NullPointerException("lang must not be null");
    }

    String annotators = "tokenize, ssplit, pos, lemma, stopword";
    // the following annotators are optional and only available in English at the moment
    boolean english = "en".equals(lang);
    if (english && isMweFeatureOn) {
      annotators += ", mwe";
    }
    if (english && isNeFeatureOn) {
      annotators += ", ner";
    }
    if (english && isSyntacticFeatureOn) {
      annotators += ", parse";
    }
    if (this instanceof EpinionReader) {
      annotators = annotators.replace("tokenize,", "tokenize, cleanxml,");
    }

    Properties props = new Properties();
    props.put("annotators", annotators);
    // TODO this is clearly not necessary in all the cases
    // this should be used only when it is desired for new lines to act as sentence boundaries
    props.put("ssplit.boundariesToDiscard", "*NL*");
    props.put("tokenize.options", "invertible,ptb3Escaping=true,tokenizeNLs");
    props.put("pos.maxlen", "100");
    props.put("lang", lang);
    // the feature specific properties depend on the flags only, not on the language
    if (isMweFeatureOn) {
      props.put("mwe.file", System.getProperty("user.dir") + "/resources/wikiMWEfreqs.txt");
    }
    if (isNeFeatureOn) {
      props.put("ner.useSUTime", "false");
    }
    if (isSyntacticFeatureOn) {
      props.put("parser.maxlen", "100");
    }
    sentenceAnalyzer = new SzTECoreNLP(props);
  }

  /**
   * @param ext
   *          file name suffix of the files to process
   */
  protected void setFileExtension(String ext) {
    fileType = ext;
  }

  /**
   * @param ga
   *          new value of the gold annotation flag
   */
  public void setUseGoldAnnotation(boolean ga) {
    goldAnnotation = ga;
  }

  /** @return the character encoding associated with this reader */
  public Charset getEncoding() {
    return m_encoding;
  }

  /**
   * Reads every file of the directory whose name ends with the extension of this reader (or with
   * {@link #DEFAULT_EXTENSION} if no extension has been set) and adds the documents to the set.
   * Training documents without any keyphrase are left out.
   *
   * @param dir
   *          - directory to search for the documents
   * @param train
   *          - whether the documents will serve as train instances
   * @param docSet
   *          - the set the documents are added to
   * @throws IllegalArgumentException
   *           if the content of {@code dir} cannot be listed
   */
  public void addDirectoryOfFiles(String dir, boolean train, DocumentSet docSet) {
    // listFiles() returns null (instead of throwing) if dir is not a directory or cannot be read
    File[] candidates = new File(dir).listFiles();
    if (candidates == null) {
      throw new IllegalArgumentException("Cannot list the files of '" + dir + "': not a readable directory");
    }
    String extension = fileType != null ? fileType : DEFAULT_EXTENSION;
    for (File candidate : candidates) {
      if (!candidate.getName().endsWith(extension)) {
        continue;
      }
      for (DocumentData document : getContent(dir, candidate.getAbsolutePath())) {
        boolean hasKeyphrases = document.getKeyphrases() != null && document.getKeyphrases().size() > 0;
        if (!train || hasKeyphrases) {
          docSet.add(document);
        }
      }
    }
  }
}