package hu.u_szeged.kpe.readers;

import hu.u_szeged.utils.NLPUtils;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * This is probably the simplest kind of {@link KpeReader} imaginable.<br>
 * Without making any real assumptions, this reader simply treats the entire content of a file as valuable
 * content.<br>
 * Etalon (gold standard) keyphrases of the documents are expected to be listed in a file named
 * <code>etalon.keys</code> located in the same directory where the documents reside.<br>
 * Each line of <code>etalon.keys</code> should contain the identifier of a document, followed by a TAB and
 * the comma-separated list of etalon keyphrases for that document. A document is looked up by its file name
 * with the file type of this reader removed from it, so that is the identifier the lines have to start
 * with.<br>
 * The presence of etalon keyphrases in file <code>etalon.keys</code> is crucial for training data.
 *
 * @author berend
 */
public class GeneralReader extends KpeReader {

  /**
   * Maps document identifiers to their newline-separated etalon keyphrases. It is populated lazily on the
   * first call of {@link #getContent(String, String)} and is not reloaded afterwards, i.e. the directory
   * passed to the first call determines which <code>etalon.keys</code> file is used.
   */
  private Map<String, String> etalonKeyphrases;

  /**
   * Creates the single {@link DocumentData} belonging to the given file.
   *
   * @param dir
   *          directory that is expected to contain the file <code>etalon.keys</code>
   * @param file
   *          path of the document
   * @return a list holding exactly one {@link DocumentData}; the keyphrases handed over to it are
   *         <code>null</code> if <code>etalon.keys</code> has no (well-formed) entry for the document
   */
  @Override
  public List<DocumentData> getContent(String dir, String file) {
    if (etalonKeyphrases == null) {
      readEtalonKeyphrases(dir + "/etalon.keys");
    }
    // documents are identified by their bare file name with the file type stripped from it
    String documentKey = new File(file).getName().replace(fileType, "");
    List<DocumentData> toReturn = new ArrayList<DocumentData>(1);
    toReturn.add(new DocumentData(etalonKeyphrases.get(documentKey), file, this.getClass()));
    return toReturn;
  }

  /**
   * Returns the content of the file, each element read from it being terminated by a newline character.
   *
   * @param file
   *          path of the document to read
   * @param numberWithinFile
   *          ignored by this reader, as a file is always treated as one single document
   * @return the concatenated content of the file
   */
  @Override
  public String getText(String file, int numberWithinFile) {
    List<String> lines = new ArrayList<>();
    NLPUtils.readDocToCollection(file, lines);
    // the buffer never leaves this method, so the unsynchronized StringBuilder is sufficient
    StringBuilder sb = new StringBuilder();
    for (String line : lines) {
      sb.append(line).append('\n');
    }
    return sb.toString();
  }

  /**
   * @param line
   *          the line to inspect (not used)
   * @return always <code>false</code>
   */
  @Override
  protected boolean mightBeSectionHeader(String line) {
    // as this class intends to be very general, there seem to be no ways of generalizing section headers.
    return false;
  }

  /** Makes this reader use the default file extension as its file type. */
  @Override
  protected void setDetails() {
    fileType = DEFAULT_EXTENSION;
  }

  /**
   * (Re)initializes {@link #etalonKeyphrases} from the given file. The text before the first TAB of a line
   * is used as key, whereas the text between the first and the second TAB is stored as value after its
   * commas (together with the whitespaces surrounding them) got replaced by newline characters.
   * Lines lacking the TAB-separated keyphrase part (e.g. empty lines) are skipped.
   *
   * @param path
   *          location of the file <code>etalon.keys</code>
   */
  private void readEtalonKeyphrases(String path) {
    etalonKeyphrases = new HashMap<>();
    List<String> lines = new ArrayList<>();
    NLPUtils.readDocToCollection(path, lines);
    for (String line : lines) {
      String[] parts = line.split("\t");
      if (parts.length < 2) {
        // String#split drops trailing empty strings, so blank lines and lines with nothing after the TAB
        // end up here; indexing parts[1] would throw an ArrayIndexOutOfBoundsException for them
        continue;
      }
      etalonKeyphrases.put(parts[0], parts[1].replaceAll("\\s*,\\s*", "\n"));
    }
  }
}