RepecNepImporter.java example

Explorer
jabref-master
- src
package org.jabref.logic.importer.fileformat;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.util.FileExtensions;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.Date;
import org.jabref.model.entry.FieldName;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Imports a New Economics Papers-Message from the REPEC-NEP Service.
 * <p>
 * <p><a href="http://www.repec.org">RePEc (Research Papers in Economics)</a>
 * is a collaborative effort of over 100 volunteers in 49 countries
 * to enhance the dissemination of research in economics. The heart of
 * the project is a decentralized database of working papers, journal
 * articles and software components. All RePEc material is freely available.
 * At the time of writing, RePEc holds over 300,000 items.</p>
 * <p>
 * <p><a href="http://nep.repec.org">NEP (New Economics Papers)</a> is an announcement
 * service which filters information on new additions to RePEc into edited
 * reports. The goal is to provide subscribers with up-to-date information
 * on the research literature.</p>
 * <p>
 * <p>This importer is capable of importing NEP messages into JabRef.</p>
 * <p>
 * <p>There is no officially defined message format for NEP. NEP messages are assumed to have
 * (and almost always have) the form given by the following semi-formal grammar:
 * <pre>
 * NEPMessage:
 *       MessageSection NEPMessage
 *       MessageSection
 *
 * MessageSection:
 *       OverviewMessageSection
 *       OtherMessageSection
 *
 * # we skip the overview
 * OverviewMessageSection:
 *       'In this issue we have: ' SectionSeparator OtherStuff
 *
 * OtherMessageSection:
 *       SectionSeparator  OtherMessageSectionContent
 *
 * # we skip other stuff and read only full working paper references
 * OtherMessageSectionContent:
 *       WorkingPaper EmptyLine OtherMessageSectionContent
 *       OtherStuff EmptyLine OtherMessageSectionContent
 *       ''
 *
 * OtherStuff:
 *       NonEmptyLine OtherStuff
 *       NonEmptyLine
 *
 * NonEmptyLine:
 *       a non-empty String that does not start with a number followed by a '.'
 *
 * # working papers are recognized by a number followed by a '.'
 * # in a non-overview section
 * WorkingPaper:
 *       Number'.' WhiteSpace TitleString EmptyLine Authors EmptyLine Abstract AdditionalFields
 *       Number'.' WhiteSpace TitleString AdditionalFields Abstract AdditionalFields
 *
 * TitleString:
 *       a String that may span several lines and should be joined
 *
 * # there must be at least one author
 * Authors:
 *       Author '\n' Authors
 *       Author '\n'
 *
 * # optionally, an institution is given for an author
 * Author:
 *       AuthorName
 *       AuthorName '(' Institution ')'
 *
 * # there are no rules about the name, it may be firstname lastname or lastname, firstname or anything else
 * AuthorName:
 *       a non-empty String without '(' or ')' characters, not spanning more than one line
 *
 * Institution:
 *       a non-empty String that may span several lines
 *
 * Abstract:
 *       a (possibly empty) String that may span several lines
 *
 * AdditionalFields:
 *       AdditionalField '\n' AdditionalFields
 *       EmptyLine AdditionalFields
 *       ''
 *
 * AdditionalField:
 *       'Keywords:' KeywordList
 *       'URL:' non-empty String
 *       'Date:' DateString
 *       'JEL:' JelClassificationList
 *       'By:' Authors
 *
 * KeywordList:
 *        Keyword ',' KeywordList
 *        Keyword ';' KeywordList
 *        Keyword
 *
 * Keyword:
 *        non-empty String that does not contain ',' or ';' (may contain whitespace)
 *
 * # if no date is given, or the given date cannot be parsed, no date is set on the entry
 * DateString:
 *        'yyyy-MM-dd'
 *        'yyyy-MM'
 *        'yyyy'
 *
 * JelClassificationList:
 *        JelClassification JelClassificationList
 *        JelClassification
 *
 * # the JEL classifications are stored in a separate BibTeX field named 'jel';
 * # they will appear if you add 'jel' as a field to one of the BibTeX entry sections
 * JelClassification:
 *        one of the allowed classes, see http://ideas.repec.org/j/
 *
 * SectionSeparator:
 *       '\n-----------------------------'
 * </pre>
 * </p>
 *
 * @author andreas_sf at rudert-home dot de
 * @see <a href="http://nep.repec.org">NEP</a>
 */
public class RepecNepImporter extends Importer {

    private static final Log LOGGER = LogFactory.getLog(RepecNepImporter.class);

    /** Field names that may introduce an "AdditionalField" line (the text before the first ':'). */
    private static final Collection<String> RECOGNIZED_FIELDS = Arrays.asList("Keywords", "JEL", "Date", "URL", "By");

    /** First line of a working paper: a number, a '.', one whitespace character, then arbitrary text. */
    private static final Pattern WORKING_PAPER_START = Pattern.compile("\\d+\\.\\s.*");

    /** Prefix of the dashed line that separates message sections. */
    private static final String SECTION_SEPARATOR = "-----------------------------";

    /** Prefix of the line that directly precedes the separator of the overview section. */
    private static final String OVERVIEW_HEADER = "In this issue we have";

    private final ImportFormatPreferences importFormatPreferences;

    // Parser state. It is mutated while a message is being read and reset at the start of every
    // importDatabase call; nothing here is synchronized.

    /** Number of readLine calls made during the current import; only used in error messages. */
    private int line;
    /** The line currently being examined, or null once the end of the input has been reached. */
    private String lastLine = "";
    /** The line that was current before the most recent readLine call. */
    private String preLine = "";
    /** True while the parser is inside the overview section, whose numbered lines are not papers. */
    private boolean inOverviewSection;


    public RepecNepImporter(ImportFormatPreferences importFormatPreferences) {
        this.importFormatPreferences = importFormatPreferences;
    }

    @Override
    public String getName() {
        return "REPEC New Economic Papers (NEP)";
    }

    @Override
    public String getId() {
        return "repecnep";
    }

    @Override
    public FileExtensions getExtensions() {
        return FileExtensions.REPEC;
    }

    @Override
    public String getDescription() {
        return "Imports a New Economics Papers-Message from the REPEC-NEP Service.";
    }

    /**
     * Checks whether the input looks like a NEP message.
     * <p>
     * The first lines of the input are concatenated (without line separators) and searched for
     * the string 'NEP: New Economics Papers' or the host name 'nep.repec.org'.
     *
     * @param reader the input to inspect
     * @return true if one of the two markers is found at the start of the message
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
        StringBuilder startOfMessage = new StringBuilder();
        String tmpLine = reader.readLine();
        for (int i = 0; (i < 25) && (tmpLine != null); i++) {
            startOfMessage.append(tmpLine);
            tmpLine = reader.readLine();
        }
        String header = startOfMessage.toString();
        return header.contains("NEP: New Economics Papers") || header.contains("nep.repec.org");
    }

    /**
     * Tells whether the current line starts with one of the given keywords followed by a ':'.
     *
     * @param keywords the accepted keywords
     * @return true if the text before the first ':' of the current line is one of {@code keywords};
     *         false otherwise, in particular at the end of the input
     */
    private boolean startsWithKeyword(Collection<String> keywords) {
        if (this.lastLine == null) {
            // end of input: there is no line that could start with a keyword
            return false;
        }
        int colon = this.lastLine.indexOf(':');
        return (colon >= 1) && keywords.contains(this.lastLine.substring(0, colon));
    }

    /**
     * Advances to the next input line: the current line becomes the previous one and the line
     * counter is incremented. At the end of the input the current line becomes null.
     */
    private void readLine(BufferedReader in) throws IOException {
        this.line++;
        this.preLine = this.lastLine;
        this.lastLine = in.readLine();
    }

    /**
     * Read multiple lines.
     * <p>
     * Starting with the current line, reads lines until either
     * <ul>
     * <li>an empty line</li>
     * <li>the end of file</li>
     * <li>the next working paper or</li>
     * <li>a keyword</li>
     * </ul>
     * is found. Every line is trimmed and the lines are joined with one blank character.
     *
     * @return the joined text; the empty string if the end of the input was already reached
     */
    private String readMultipleLines(BufferedReader in) throws IOException {
        if (this.lastLine == null) {
            return "";
        }
        StringBuilder result = new StringBuilder(this.lastLine.trim());
        readLine(in);
        while ((this.lastLine != null) && !this.lastLine.trim().isEmpty()
                && !startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS) && !isStartOfWorkingPaper()) {
            result.append(' ').append(this.lastLine.trim());
            readLine(in);
        }
        return result.toString();
    }

    /**
     * Implements grammar rule "TitleString".
     * <p>
     * Must be called while the current line is the first line of a working paper.
     *
     * @param be the entry that receives the title
     * @param in the input to read continuation lines from
     * @throws IOException if reading from {@code in} fails
     */
    private void parseTitleString(BibEntry be, BufferedReader in) throws IOException {
        // skip article number
        this.lastLine = this.lastLine.substring(this.lastLine.indexOf('.') + 1);
        be.setField(FieldName.TITLE, readMultipleLines(in));
    }

    /**
     * Implements grammar rule "Authors".
     * <p>
     * Reads one author per line until a blank line, a keyword line or the end of the input.
     * An institution may follow the author name in parentheses and may continue on the following
     * lines until the closing parenthesis. Authors and institutions are each joined with " and ".
     *
     * @param be the entry that receives the author and institution fields
     * @param in the input to read from
     * @throws IOException if reading from {@code in} fails
     */
    private void parseAuthors(BibEntry be, BufferedReader in) throws IOException {
        List<String> authors = new ArrayList<>();
        List<String> institutions = new ArrayList<>();

        while ((this.lastLine != null) && !this.lastLine.trim().isEmpty()
                && !startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS)) {

            // read single author
            String author;
            StringBuilder institution = new StringBuilder();
            boolean institutionDone;

            int open = this.lastLine.indexOf('(');
            if (open >= 0) {
                author = this.lastLine.substring(0, open).trim();
                // only a ')' behind the '(' can close the institution
                int close = this.lastLine.indexOf(')', open + 1);
                institutionDone = close >= 0;
                int end = institutionDone ? close : this.lastLine.length();
                institution.append(this.lastLine.substring(open + 1, end).trim());
            } else {
                author = this.lastLine.trim();
                institutionDone = true;
            }

            readLine(in);
            // the institution continues on the following lines until its closing parenthesis
            while (!institutionDone && (this.lastLine != null)) {
                int close = this.lastLine.indexOf(')');
                institutionDone = close >= 0;
                int end = institutionDone ? close : this.lastLine.length();
                String fragment = this.lastLine.substring(0, end).trim();
                if (!fragment.isEmpty()) {
                    if (institution.length() > 0) {
                        // keep the words of consecutive lines apart
                        institution.append(' ');
                    }
                    institution.append(fragment);
                }
                readLine(in);
            }

            authors.add(author);
            if (institution.length() > 0) {
                institutions.add(institution.toString());
            }
        }

        if (!authors.isEmpty()) {
            be.setField(FieldName.AUTHOR, String.join(" and ", authors));
        }
        if (!institutions.isEmpty()) {
            be.setField(FieldName.INSTITUTION, String.join(" and ", institutions));
        }
    }

    /**
     * Implements grammar rule "Abstract".
     * <p>
     * The abstract field is only set if the text read is not empty.
     *
     * @param be the entry that receives the abstract
     * @param in the input to read from
     * @throws IOException if reading from {@code in} fails
     */
    private void parseAbstract(BibEntry be, BufferedReader in) throws IOException {
        String theabstract = readMultipleLines(in);

        if (!theabstract.isEmpty()) {
            be.setField(FieldName.ABSTRACT, theabstract);
        }
    }

    /**
     * Implements grammar rule "AdditionalFields".
     * <p>
     * Consumes keyword lines and empty lines until the next working paper, any other kind of line
     * or the end of the input is reached.
     *
     * @param be                       the entry that receives the fields
     * @param multilineUrlFieldAllowed if true, a URL may continue on the following lines;
     *                                 otherwise only the rest of the 'URL:' line is used
     * @param in                       the input to read from
     * @throws IOException if reading from {@code in} fails
     */
    private void parseAdditionalFields(BibEntry be, boolean multilineUrlFieldAllowed, BufferedReader in)
            throws IOException {

        // one empty line is possible before fields start
        if ((this.lastLine != null) && this.lastLine.trim().isEmpty()) {
            readLine(in);
        }

        // read other fields
        while ((this.lastLine != null) && !isStartOfWorkingPaper()
                && (startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS) || this.lastLine.isEmpty())) {

            // split "Keyword: content"; an empty line has neither and is skipped in the last branch
            String keyword = "";
            if (!this.lastLine.isEmpty()) {
                int colon = this.lastLine.indexOf(':');
                keyword = this.lastLine.substring(0, colon).trim();
                // keep only the content so that the readers below start behind the keyword
                this.lastLine = this.lastLine.substring(colon + 1).trim();
            }

            if ("Keywords".equals(keyword)) {
                // parse keywords field
                String content = readMultipleLines(in);
                String[] keywords = content.split("[,;]");
                be.addKeywords(Arrays.asList(keywords),
                        importFormatPreferences.getKeywordSeparator());
            } else if ("JEL".equals(keyword)) {
                // parse JEL field
                be.setField("jel", readMultipleLines(in));
            } else if (keyword.startsWith("Date")) {
                // parse date field; an unparsable date is ignored
                String content = readMultipleLines(in);
                Date.parse(content).ifPresent(be::setDate);
            } else if (keyword.startsWith("URL")) {
                // parse URL field
                String content;
                if (multilineUrlFieldAllowed) {
                    content = readMultipleLines(in);
                } else {
                    content = this.lastLine;
                    readLine(in);
                }
                be.setField(FieldName.URL, content);
            } else if (keyword.startsWith("By")) {
                // authors field
                parseAuthors(be, in);
            } else {
                readLine(in);
            }
        }
    }

    /**
     * If the current line starts with a string of the form 'x. ', the previous line is blank and
     * we are not in the overview section, we have a working paper entry we are interested in.
     *
     * @return true if the current line is the first line of a working paper; false otherwise,
     *         in particular at the end of the input
     */
    private boolean isStartOfWorkingPaper() {
        return (this.lastLine != null)
                && (this.preLine != null)
                && !this.inOverviewSection
                && this.preLine.trim().isEmpty()
                && WORKING_PAPER_START.matcher(this.lastLine).matches();
    }

    /**
     * Parses a NEP message and creates one "techreport" entry per working paper found.
     * <p>
     * Any exception raised while parsing is logged and turned into an error result.
     *
     * @param reader the message to parse; must not be null
     * @return the parsed entries, or an error result naming the line (and paper number, if known)
     *         at which parsing failed
     * @throws IOException declared by the overridden method; failures while reading are reported
     *                     through the error result instead
     */
    @Override
    public ParserResult importDatabase(BufferedReader reader) throws IOException {
        Objects.requireNonNull(reader);

        List<BibEntry> bibitems = new ArrayList<>();
        String paperNoStr = null;

        // start from a clean state so that an earlier import on this instance cannot leak into this one
        this.line = 0;
        this.lastLine = "";
        this.preLine = "";
        this.inOverviewSection = false;

        try {
            readLine(reader); // skip header and editor information
            while (this.lastLine != null) {

                if (this.lastLine.startsWith(SECTION_SEPARATOR)) {
                    // every separator starts a new section; only the one behind the overview header is skipped
                    this.inOverviewSection = this.preLine.startsWith(OVERVIEW_HEADER);
                }
                if (isStartOfWorkingPaper()) {
                    BibEntry be = new BibEntry();
                    be.setType("techreport");
                    paperNoStr = this.lastLine.substring(0, this.lastLine.indexOf('.'));
                    parseTitleString(be, reader);
                    if (startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS)) {
                        parseAdditionalFields(be, false, reader);
                    } else {
                        readLine(reader); // skip empty line
                        parseAuthors(be, reader);
                        readLine(reader); // skip empty line
                    }
                    if (!startsWithKeyword(RepecNepImporter.RECOGNIZED_FIELDS)) {
                        parseAbstract(be, reader);
                    }
                    parseAdditionalFields(be, true, reader);

                    bibitems.add(be);
                    paperNoStr = null;

                } else {
                    readLine(reader);
                }
            }

        } catch (Exception e) {
            StringBuilder message = new StringBuilder("Error in REPEC-NEP import on line ").append(this.line);
            if (paperNoStr != null) {
                message.append(", paper no. ").append(paperNoStr);
            }
            // always separate the location from the cause, also when no paper number is known
            message.append(": ").append(e.getLocalizedMessage());
            LOGGER.error(message.toString(), e);
            return ParserResult.fromErrorMessage(message.toString());
        }

        return new ParserResult(bibitems);
    }
}