package org.jabref.logic.importer.fileformat; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Objects; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.importer.Importer; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.util.FileExtensions; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.Date; import org.jabref.model.entry.FieldName; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Imports a New Economics Papers-Message from the REPEC-NEP Service. * <p> * <p><a href="http://www.repec.org">RePEc (Research Papers in Economics)</a> * is a collaborative effort of over 100 volunteers in 49 countries * to enhance the dissemination of research in economics. The heart of * the project is a decentralized database of working papers, journal * articles and software components. All RePEc material is freely available.</p> * At the time of writing RePEc holds over 300.000 items.</p> * <p> * <p><a href="http://nep.repec.org">NEP (New Economic Papers)</a> is an announcement * service which filters information on new additions to RePEc into edited * reports. The goal is to provide subscribers with up-to-date information * to the research literature.</p> * <p> * <p>This importer is capable of importing NEP messages into JabRef.</p> * <p> * <p>There is no officially defined message format for NEP. NEP messages are assumed to have * (and almost always have) the form given by the following semi-formal grammar: * <pre> * NEPMessage: * MessageSection NEPMessage * MessageSection * * MessageSection: * OverviewMessageSection * OtherMessageSection * * # we skip the overview * OverviewMessageSection: * 'In this issue we have: ' SectionSeparator OtherStuff * * OtherMessageSection: * SectionSeparator OtherMessageSectionContent * * # we skip other stuff and read only full working paper references * OtherMessageSectionContent: * WorkingPaper EmptyLine OtherMessageSectionContent * OtherStuff EmptyLine OtherMessageSectionContent * '' * * OtherStuff: * NonEmptyLine OtherStuff * NonEmptyLine * * NonEmptyLine: * a non-empty String that does not start with a number followed by a '.' * * # working papers are recognized by a number followed by a '.' * # in a non-overview section * WorkingPaper: * Number'.' WhiteSpace TitleString EmptyLine Authors EmptyLine Abstract AdditionalFields * Number'.' WhiteSpace TitleString AdditionalFields Abstract AdditionalFields * * TitleString: * a String that may span several lines and should be joined * * # there must be at least one author * Authors: * Author '\n' Authors * Author '\n' * * # optionally, an institution is given for an author * Author: * AuthorName * AuthorName '(' Institution ')' * * # there are no rules about the name, it may be firstname lastname or lastname, firstname or anything else * AuthorName: * a non-empty String without '(' or ')' characters, not spanning more that one line * * Institution: * a non-empty String that may span several lines * * Abstract: * a (possibly empty) String that may span several lines * * AdditionalFields: * AdditionalField '\n' AdditionalFields * EmptyLine AdditionalFields * '' * * AdditionalField: * 'Keywords:' KeywordList * 'URL:' non-empty String * 'Date:' DateString * 'JEL:' JelClassificationList * 'By': Authors * * KeywordList: * Keyword ',' KeywordList * Keyword ';' KeywordList * Keyword * * Keyword: * non-empty String that does not contain ',' (may contain whitespace) * * # if no date is given, the current year as given by the system clock is assumed * DateString: * 'yyyy-MM-dd' * 'yyyy-MM' * 'yyyy' * * JelClassificationList: * JelClassification JelClassificationList * JelClassification * * # the JEL Classifications are set into a new BIBTEX-field 'jel' * # they will appear if you add it as a field to one of the BIBTex Entry sections * JelClassification: * one of the allowed classes, see http://ideas.repec.org/j/ * * SectionSeparator: * '\n-----------------------------' * </pre> * </p> * * @author andreas_sf at rudert-home dot de * @see <a href="http://nep.repec.org">NEP</a> */ public class RepecNepImporter extends Importer { private static final Log LOGGER = LogFactory.getLog(RepecNepImporter.class); private static final Collection<String> RECOGNIZED_FIELDS = Arrays.asList("Keywords", "JEL", "Date", "URL", "By"); private final ImportFormatPreferences importFormatPreferences; private int line; private String lastLine = ""; private String preLine = ""; private boolean inOverviewSection; public RepecNepImporter(ImportFormatPreferences importFormatPreferences) { this.importFormatPreferences = importFormatPreferences; } @Override public String getName() { return "REPEC New Economic Papers (NEP)"; } @Override public String getId() { return "repecnep"; } @Override public FileExtensions getExtensions() { return FileExtensions.REPEC; } @Override public String getDescription() { return "Imports a New Economics Papers-Message from the REPEC-NEP Service."; } @Override public boolean isRecognizedFormat(BufferedReader reader) throws IOException { // read the first couple of lines // NEP message usually contain the String 'NEP: New Economics Papers' // or, they are from nep.repec.org StringBuilder startOfMessage = new StringBuilder(); String tmpLine = reader.readLine(); for (int i = 0; (i < 25) && (tmpLine != null); i++) { startOfMessage.append(tmpLine); tmpLine = reader.readLine(); } return startOfMessage.toString().contains("NEP: New Economics Papers") || startOfMessage.toString().contains( "nep.repec.org"); } private boolean startsWithKeyword(Collection<String> keywords) { boolean result = this.lastLine.indexOf(':') >= 1; if (result) { String possibleKeyword = this.lastLine.substring(0, this.lastLine.indexOf(':')); result = keywords.contains(possibleKeyword); } return result; } private void readLine(BufferedReader in) throws IOException { this.line++; this.preLine = this.lastLine; this.lastLine = in.readLine(); } /** * Read multiple lines. * <p> * <p>Reads multiple lines until either * <ul> * <li>an empty line</li> * <li>the end of file</li> * <li>the next working paper or</li> * <li>a keyword</li> * </ul> * is found. Whitespace at start or end of lines is trimmed except for one blank character.</p> * * @return result */ private String readMultipleLines(BufferedReader in) throws IOException { StringBuilder result = new StringBuilder(this.lastLine.trim()); readLine(in); while ((this.lastLine != null) && !"".equals(this.lastLine.trim()) && !startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS) && !isStartOfWorkingPaper()) { result.append(this.lastLine.isEmpty() ? this.lastLine.trim() : " " + this.lastLine.trim()); readLine(in); } return result.toString(); } /** * Implements grammar rule "TitleString". * * @param be * @throws IOException */ private void parseTitleString(BibEntry be, BufferedReader in) throws IOException { // skip article number this.lastLine = this.lastLine.substring(this.lastLine.indexOf('.') + 1, this.lastLine.length()); be.setField(FieldName.TITLE, readMultipleLines(in)); } /** * Implements grammar rule "Authors" * * @param be * @throws IOException */ private void parseAuthors(BibEntry be, BufferedReader in) throws IOException { // read authors and institutions List<String> authors = new ArrayList<>(); StringBuilder institutions = new StringBuilder(); while ((this.lastLine != null) && !"".equals(this.lastLine) && !startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS)) { // read single author String author; StringBuilder institution = new StringBuilder(); boolean institutionDone; if (this.lastLine.indexOf('(') >= 0) { author = this.lastLine.substring(0, this.lastLine.indexOf('(')).trim(); institutionDone = this.lastLine.indexOf(')') >= 1; institution .append(this.lastLine.substring(this.lastLine.indexOf('(') + 1, institutionDone && (this.lastLine .indexOf(')') > (this.lastLine.indexOf('(') + 1)) ? this.lastLine .indexOf(')') : this.lastLine.length()) .trim()); } else { author = this.lastLine.substring(0, this.lastLine.length()).trim(); institutionDone = true; } readLine(in); while (!institutionDone && (this.lastLine != null)) { institutionDone = this.lastLine.indexOf(')') >= 1; institution.append(this.lastLine .substring(0, institutionDone ? this.lastLine.indexOf(')') : this.lastLine.length()).trim()); readLine(in); } authors.add(author); if (institution.length() > 0) { institutions.append( (institutions.length() == 0) ? institution.toString() : " and " + institution.toString()); } } if (!authors.isEmpty()) { be.setField(FieldName.AUTHOR, String.join(" and ", authors)); } if (institutions.length() > 0) { be.setField(FieldName.INSTITUTION, institutions.toString()); } } /** * Implements grammar rule "Abstract". * * @param be * @throws IOException */ private void parseAbstract(BibEntry be, BufferedReader in) throws IOException { String theabstract = readMultipleLines(in); if (!"".equals(theabstract)) { be.setField(FieldName.ABSTRACT, theabstract); } } /** * Implements grammar rule "AdditionalFields". * * @param be * @throws IOException */ private void parseAdditionalFields(BibEntry be, boolean multilineUrlFieldAllowed, BufferedReader in) throws IOException { // one empty line is possible before fields start if ((this.lastLine != null) && "".equals(this.lastLine.trim())) { readLine(in); } // read other fields while ((this.lastLine != null) && !isStartOfWorkingPaper() && (startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS) || "".equals(this.lastLine))) { // if multiple lines for a field are allowed and field consists of multiple lines, join them String keyword = "".equals(this.lastLine) ? "" : this.lastLine.substring(0, this.lastLine.indexOf(':')).trim(); // skip keyword this.lastLine = "".equals(this.lastLine) ? "" : this.lastLine.substring(this.lastLine.indexOf(':') + 1, this.lastLine.length()).trim(); // parse keywords field if ("Keywords".equals(keyword)) { String content = readMultipleLines(in); String[] keywords = content.split("[,;]"); be.addKeywords(Arrays.asList(keywords), importFormatPreferences.getKeywordSeparator()); // parse JEL field } else if ("JEL".equals(keyword)) { be.setField("jel", readMultipleLines(in)); } else if (keyword.startsWith("Date")) { // parse date field String content = readMultipleLines(in); Date.parse(content).ifPresent(be::setDate); // parse URL field } else if (keyword.startsWith("URL")) { String content; if (multilineUrlFieldAllowed) { content = readMultipleLines(in); } else { content = this.lastLine; readLine(in); } be.setField(FieldName.URL, content); // authors field } else if (keyword.startsWith("By")) { // parse authors parseAuthors(be, in); } else { readLine(in); } } } /** * if line starts with a string of the form 'x. ' and we are not in the overview * section, we have a working paper entry we are interested in */ private boolean isStartOfWorkingPaper() { return this.lastLine.matches("\\d+\\.\\s.*") && !this.inOverviewSection && "".equals(this.preLine.trim()); } @Override public ParserResult importDatabase(BufferedReader reader) throws IOException { Objects.requireNonNull(reader); List<BibEntry> bibitems = new ArrayList<>(); String paperNoStr = null; this.line = 0; try { readLine(reader); // skip header and editor information while (this.lastLine != null) { if (this.lastLine.startsWith("-----------------------------")) { this.inOverviewSection = this.preLine.startsWith("In this issue we have"); } if (isStartOfWorkingPaper()) { BibEntry be = new BibEntry(); be.setType("techreport"); paperNoStr = this.lastLine.substring(0, this.lastLine.indexOf('.')); parseTitleString(be, reader); if (startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS)) { parseAdditionalFields(be, false, reader); } else { readLine(reader); // skip empty line parseAuthors(be, reader); readLine(reader); // skip empty line } if (!startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS)) { parseAbstract(be, reader); } parseAdditionalFields(be, true, reader); bibitems.add(be); paperNoStr = null; } else { this.preLine = this.lastLine; readLine(reader); } } } catch (Exception e) { String message = "Error in REPEC-NEP import on line " + this.line; if (paperNoStr != null) { message += ", paper no. " + paperNoStr + ": "; } message += e.getLocalizedMessage(); LOGGER.error(message, e); return ParserResult.fromErrorMessage(message); } return new ParserResult(bibitems); } }