package hu.u_szeged.kpe.readers; import hu.u_szeged.utils.NLPUtils; import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class HulthReader extends GeneralReader { private static final String DEFAULT_EXTENSION = ".abstr"; private Map<String, String> etalonKeyphrases; protected void setDetails() { fileType = DEFAULT_EXTENSION; } @Override public List<DocumentData> getContent(String dir, String file) { if (etalonKeyphrases == null) { readEtalonKeyphrases(dir); } List<DocumentData> toReturn = new ArrayList<DocumentData>(1); toReturn.add(new DocumentData(etalonKeyphrases.get(new File(file).getName().replace(fileType, "")), file, this.getClass())); return toReturn; } public String getText(String file, int numberWithinFile) { StringBuffer sb = new StringBuffer(); List<String> lines = new ArrayList<>(); NLPUtils.readDocToCollection(file, lines); boolean firstLine = true; for (String line : lines) { sb.append(line + (firstLine ? "\n" : " ")); firstLine = false; } return sb.toString(); } private void readEtalonKeyphrases(String dir) { etalonKeyphrases = new HashMap<String, String>(); for (String fileName : new File(dir).list()) { if (fileName.endsWith(".uncontr")) { List<String> lines = new ArrayList<>(); NLPUtils.readDocToCollection(dir + "/" + fileName, lines); StringBuffer sb = new StringBuffer(); for (String line : lines) { sb.append(line + " "); } etalonKeyphrases.put(fileName.replace(".uncontr", ""), sb.toString().trim().replaceAll("\\s+", " ").replaceAll(";\\s+", "\n")); } } } }