package hu.u_szeged.kpe.readers;

import hu.u_szeged.utils.NLPUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * {@link KpeReader} implementation that
 * <ul>
 * <li>loads the keyphrases of a document from a sibling {@code .key} file ({@link #getContent}),</li>
 * <li>reads the raw document text while putting the first {@code Abstract} marker on its own line
 * ({@link #getText}),</li>
 * <li>recognises numbered headings such as {@code "2.1 Methods"} as potential section headers
 * ({@link #mightBeSectionHeader}).</li>
 * </ul>
 */
public class ScientificReader extends KpeReader {

  // really ugly pattern for detecting numeric expressions
  protected static final Pattern NUM_PATTERN = Pattern.compile("(([-+±]|(\\+/-))?\\d+([.,]\\d+)?%?)+");

  /**
   * Numbered heading: a one- or two-digit number not starting with 0, optional dotted sub-numbers, an optional
   * trailing dot, one whitespace character and at least one further character.
   */
  private static final Pattern SECTION_HEADER_PATTERN = Pattern.compile("[1-9]\\d?(\\.\\d+\\.?)*\\.?\\s.+");

  /** The word "Abstract" (or "ABSTRACT") surrounded by single spaces. */
  private static final Pattern ABSTRACT_PATTERN = Pattern.compile(" A(bstract|BSTRACT) ");

  /** Replacement that isolates the abstract marker on a line of its own. */
  private static final String ABSTRACT_REPLACEMENT = "\r\nAbstract\r\n";

  /** Sets {@code fileType} to {@code DEFAULT_EXTENSION}. */
  protected void setDetails() {
    fileType = DEFAULT_EXTENSION;
  }

  /**
   * Builds the single {@link DocumentData} entry belonging to {@code file}.
   * <p>
   * The keyphrase file is looked up by cutting {@code file} at the first occurrence of {@code fileType}, replacing
   * every {@code "Content"} in the remaining path with {@code "KeyPhrase"} and appending {@code ".key"}. If that
   * file exists, its lines are joined with {@code "\r\n"} (each line is terminated, including the last one);
   * otherwise the keyphrase string is empty.
   *
   * @param directory not used by this implementation
   * @param file path of the document; it has to contain {@code fileType}, otherwise
   *        {@link String#substring(int, int)} throws a {@link StringIndexOutOfBoundsException}
   * @return a list holding exactly one {@code DocumentData}
   */
  public List<DocumentData> getContent(String directory, String file) {
    String filePathChunk = file.substring(0, file.indexOf(fileType));
    StringBuilder keywords = new StringBuilder();
    File keyphraseFile = new File(filePathChunk.replace("Content", "KeyPhrase") + ".key");
    if (keyphraseFile.exists()) {
      List<String> lines = new LinkedList<>();
      NLPUtils.readDocToCollection(keyphraseFile, lines);
      for (String keyphraseLine : lines) {
        keywords.append(keyphraseLine).append("\r\n");
      }
    }
    List<DocumentData> toReturn = new ArrayList<>(1);
    toReturn.add(new DocumentData(keywords.toString(), file, this.getClass()));
    return toReturn;
  }

  // public TreeMap<Integer, List<CoreMap>> sectionMapping(DocumentData doc) {
  //   List<CoreMap> grammar = tagAndParse(doc).get(SentencesAnnotation.class);
  //   int actualSection = 0;
  //   TreeMap<Integer, List<CoreMap>> sectionSentences = new TreeMap<Integer, List<CoreMap>>();
  //   sectionSentences.put(actualSection, new LinkedList<CoreMap>());
  //   boolean refSeen = false;
  //   for (int s = 0; s < grammar.size(); ++s){
  //     CoreMap sentence = grammar.get(s);
  //     List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
  //     String firstToken = tokens.get(0).word();
  //     StringBuffer firstThreeTokens = new StringBuffer(firstToken);
  //     for (int i = 1; i < 3 && i < tokens.size(); ++i){
  //       firstThreeTokens.append(' ' + tokens.get(i).word());
  //     }
  //
  //     if (!refSeen && firstToken.matches("\\d+")){
  //       int sect = Integer.parseInt(firstToken);
  //       if (sect > actualSection && sect - actualSection < 3 && sect != actualSection){
  //         // in case a section header were missed
  //         boolean ok = true;
  //         for (int i = 1; i < 6 && s + i < grammar.size(); ++i){
  //           String firstExtendedWordISentenceAway = grammar.get(s +
  //           i).get(TokensAnnotation.class).get(0).word();
  //           if (firstExtendedWordISentenceAway.matches("\\d+")
  //           && Integer.parseInt(firstExtendedWordISentenceAway) - sect == 1){
  //             ok = false;
  //             break;
  //           }
  //         }
  //         if (ok)
  //           sectionSentences.put((actualSection = sect), new LinkedList<CoreMap>());
  //       }
  //     }else if (firstThreeTokens.toString().matches("(?i)(\\d+ (\\. )?)?references?")){
  //       refSeen = true;
  //       sectionSentences.put(++actualSection, new LinkedList<CoreMap>());
  //     }
  //     sectionSentences.get(actualSection).add(sentence);
  //   }
  //   return sectionSentences;
  // }

  /**
   * Tells whether {@code line} looks like a numbered heading, e.g. {@code "1 Introduction"},
   * {@code "2.1. Methods"} or {@code "10. Conclusions"}.
   *
   * @param line the line to test; the whole line has to match
   * @return {@code true} if the line matches the numbered-heading pattern
   */
  @Override
  protected boolean mightBeSectionHeader(String line) {
    return SECTION_HEADER_PATTERN.matcher(line).matches();
  }

  /**
   * Always {@code true} for this reader.
   * NOTE(review): presumably tells the caller to drop the final paragraph of a document - confirm against
   * KpeReader.
   */
  public boolean hasUnwantedLastParagraph() {
    return true;
  }

  /**
   * Reads {@code file} line by line using {@code getEncoding()} and returns its content with every line terminated
   * by {@code "\r\n"}.
   * <p>
   * Until the abstract marker has been seen, each occurrence of {@code " Abstract "} / {@code " ABSTRACT "} in a
   * line is replaced by {@code "\r\nAbstract\r\n"}. Reading is best effort: on an {@link IOException} the stack
   * trace is printed and whatever has been read so far is returned.
   *
   * @param file path of the file to read
   * @param numberWithinFile not used by this implementation
   * @return the (possibly partial) text of the file, never {@code null}
   */
  @Override
  public String getText(String file, int numberWithinFile) {
    StringBuilder article = new StringBuilder();
    // try-with-resources closes the stream and the reader even when reading fails half-way
    try (FileInputStream in = new FileInputStream(file);
        BufferedReader br = new BufferedReader(new InputStreamReader(in, getEncoding()))) {
      boolean firstAbstractSeen = false;
      String line;
      while ((line = br.readLine()) != null) {
        if (!firstAbstractSeen) {
          int originalLength = line.length();
          line = ABSTRACT_PATTERN.matcher(line).replaceAll(ABSTRACT_REPLACEMENT);
          // The replacement is two characters longer than the match, so a growth of exactly 2 means that exactly
          // one marker was replaced in this line.
          // NOTE(review): a line containing the marker twice gets both replaced but does not set the flag.
          firstAbstractSeen = line.length() == originalLength + 2;
        }
        article.append(line).append("\r\n");
      }
    } catch (IOException e) {
      // deliberate best effort: report the problem and return what could be read
      e.printStackTrace();
    }
    return article.toString();
  }
}