/*
 Copyright (C) 2005 Andreas Rudert

 All programs in this directory and subdirectories are published under the GNU
 General Public License as described below.

 This program is free software; you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the Free
 Software Foundation; either version 2 of the License, or (at your option)
 any later version.

 This program is distributed in the hope that it will be useful, but WITHOUT
 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 details.

 You should have received a copy of the GNU General Public License along with
 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 Place, Suite 330, Boston, MA 02111-1307 USA

 Further information about the GNU GPL is available at:
 http://www.gnu.org/copyleft/gpl.ja.html

 */
package net.sf.jabref.imports;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Pattern;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.BibtexEntryType;
import net.sf.jabref.Util;

/**
 * Imports a New Economics Papers-Message from the REPEC-NEP Service.
 *
 * <p><a href="http://www.repec.org">RePEc</a> (Research Papers in Economics)
 * is a collaborative effort of over 100 volunteers in 49 countries
 * to enhance the dissemination of research in economics. The heart of
 * the project is a decentralized database of working papers, journal
 * articles and software components. All RePEc material is freely available.
 * At the time of writing RePEc holds over 300.000 items.</p>
 *
 * <p><a href="http://nep.repec.org">NEP</a> (New Economic Papers) is an announcement
 * service which filters information on new additions to RePEc into edited
 * reports. The goal is to provide subscribers with up-to-date information
 * to the research literature.</p>
 *
 * <p>This importer is capable of importing NEP messages into JabRef.</p>
 *
 * <p>There is no officially defined message format for NEP. NEP messages are assumed to have
 * (and almost always have) the form given by the following semi-formal grammar:</p>
 * <pre>
 * NEPMessage:
 *       MessageSection NEPMessage
 *       MessageSection
 *
 * MessageSection:
 *       OverviewMessageSection
 *       OtherMessageSection
 *
 * # we skip the overview
 * OverviewMessageSection:
 *       'In this issue we have: ' SectionSeparator OtherStuff
 *
 * OtherMessageSection:
 *       SectionSeparator OtherMessageSectionContent
 *
 * # we skip other stuff and read only full working paper references
 * OtherMessageSectionContent:
 *       WorkingPaper EmptyLine OtherMessageSectionContent
 *       OtherStuff EmptyLine OtherMessageSectionContent
 *       ''
 *
 * OtherStuff:
 *       NonEmptyLine OtherStuff
 *       NonEmptyLine
 *
 * NonEmptyLine:
 *       a non-empty String that does not start with a number followed by a '.'
 *
 * # working papers are recognized by a number followed by a '.'
 * # in a non-overview section
 * WorkingPaper:
 *       Number'.' WhiteSpace TitleString EmptyLine Authors EmptyLine Abstract AdditionalFields
 *       Number'.' WhiteSpace TitleString AdditionalFields Abstract AdditionalFields
 *
 * TitleString:
 *       a String that may span several lines and should be joined
 *
 * # there must be at least one author
 * Authors:
 *       Author '\n' Authors
 *       Author '\n'
 *
 * # optionally, an institution is given for an author
 * Author:
 *       AuthorName
 *       AuthorName '(' Institution ')'
 *
 * # there are no rules about the name, it may be firstname lastname or lastname, firstname or anything else
 * AuthorName:
 *       a non-empty String without '(' or ')' characters, not spanning more than one line
 *
 * Institution:
 *       a non-empty String that may span several lines
 *
 * Abstract:
 *       a (possibly empty) String that may span several lines
 *
 * AdditionalFields:
 *       AdditionalField '\n' AdditionalFields
 *       EmptyLine AdditionalFields
 *       ''
 *
 * AdditionalField:
 *       'Keywords:' KeywordList
 *       'URL:' non-empty String
 *       'Date:' DateString
 *       'JEL:' JelClassificationList
 *       'By:' Authors
 *
 * KeywordList:
 *        Keyword ',' KeywordList
 *        Keyword ';' KeywordList
 *        Keyword
 *
 * Keyword:
 *        non-empty String that does not contain ',' (may contain whitespace)
 *
 * # if no date is given, the current year as given by the system clock is assumed
 * DateString:
 *        'yyyy-MM-dd'
 *        'yyyy-MM'
 *        'yyyy'
 *
 * JelClassificationList:
 *        JelClassification JelClassificationList
 *        JelClassification
 *
 * # the JEL Classifications are set into a new BIBTEX-field 'jel'
 * # they will appear if you add it as a field to one of the BIBTex Entry sections
 * JelClassification:
 *        one of the allowed classes, see http://ideas.repec.org/j/
 *
 * SectionSeparator:
 *       '\n-----------------------------'
 * </pre>
 *
 * <p>The parser keeps its position (current line, previous line, reader) in instance
 * fields, so a single instance must not run two imports at the same time.</p>
 *
 * @see <a href="http://nep.repec.org">http://nep.repec.org</a>
 * @author andreas_sf at rudert-home dot de
 */
public class RepecNepImporter extends ImportFormat {

    /** Field labels (text before the first ':') that introduce an "AdditionalField". */
    private static final Collection<String> RECOGNIZED_FIELDS =
        Collections.unmodifiableList(Arrays.asList("Keywords", "JEL", "Date", "URL", "By"));

    /** A working paper starts with a number, a '.', one whitespace character and arbitrary text. */
    private static final Pattern WORKING_PAPER_START = Pattern.compile("\\d+\\.\\s.*");

    /** Accepted date layouts, ordered from most to least specific. */
    private static final String[] RECOGNIZED_DATE_FORMATS = {"yyyy-MM-dd", "yyyy-MM", "yyyy"};

    /** Number of leading lines inspected by {@link #isRecognizedFormat(InputStream)}. */
    private static final int HEADER_LINES_TO_SCAN = 25;

    /** Number of calls to {@link #readLine()} in the current import; used in error messages. */
    private int line = 0;
    /** The line currently being parsed; <code>null</code> once the end of input is reached. */
    private String lastLine = "";
    /** The line that was current before the last call to {@link #readLine()}. */
    private String preLine = "";
    private BufferedReader in = null;
    /** True while the parser is inside the "In this issue we have" overview section. */
    private boolean inOverviewSection = false;

    /**
     * Return the name of this import format.
     */
    public String getFormatName() {
        return "REPEC New Economic Papers (NEP)";
    }

    /*
     *  (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getCLIId()
     */
    public String getCLIId() {
        return "repecnep";
    }

    /*
     *  (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getExtensions()
     */
    public String getExtensions() {
        return ".txt";
    }

    /*
     *  (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getDescription()
     */
    public String getDescription() {
        return
          "Imports a New Economics Papers-Message (see http://nep.repec.org)\n"
        + "from the REPEC-NEP Service (see http://www.repec.org).\n"
        + "To import papers either save a NEP message as a text file and then import or\n"
        + "copy&paste the papers you want to import and make sure, one of the first lines\n"
        + "contains the line \"nep.repec.org\".";
    }

    /**
     * Checks whether the first lines of the stream look like a NEP message, i.e. whether
     * they contain the text 'NEP: New Economics Papers' or 'nep.repec.org'.
     *
     * <p>The reader created here is intentionally not closed: closing it would also
     * close the caller's stream.</p>
     *
     * @param stream the input to inspect
     * @return true if one of the two marker strings occurs in the inspected lines
     * @throws IOException if reading from the stream fails
     * @see net.sf.jabref.imports.ImportFormat#isRecognizedFormat(java.io.InputStream)
     */
    public boolean isRecognizedFormat(InputStream stream) throws IOException {
        BufferedReader reader = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
        StringBuilder startOfMessage = new StringBuilder();
        String currentLine = reader.readLine();
        for (int i = 0; i < HEADER_LINES_TO_SCAN && currentLine != null; i++) {
            startOfMessage.append(currentLine);
            currentLine = reader.readLine();
        }
        return startOfMessage.indexOf("NEP: New Economics Papers") >= 0
            || startOfMessage.indexOf("nep.repec.org") >= 0;
    }

    /**
     * Tells whether a line is empty or consists of whitespace only.
     *
     * @param text a non-null line
     */
    private static boolean isBlank(String text) {
        return text.trim().length() == 0;
    }

    /**
     * Tells whether the current line has the form <code>label:...</code> where
     * <code>label</code> (the exact text before the first ':') is one of the given keywords.
     *
     * @param keywords the accepted labels
     * @return false at the end of input or if the line carries no accepted label
     */
    private boolean startsWithKeyword(Collection<String> keywords) {
        if (this.lastLine == null) {
            return false;
        }
        int colon = this.lastLine.indexOf(':');
        return colon > 0 && keywords.contains(this.lastLine.substring(0, colon));
    }

    /**
     * Advances to the next input line, remembering the current one as the previous line.
     */
    private void readLine() throws IOException {
        this.line++;
        this.preLine = this.lastLine;
        this.lastLine = this.in.readLine();
    }

    /**
     * Read multiple lines.
     *
     * <p>Starts with the current line and keeps appending following lines until either
     * <ul>
     *   <li>an empty line</li>
     *   <li>the end of file</li>
     *   <li>the next working paper or</li>
     *   <li>a keyword</li>
     * </ul>
     * is found. Each line is trimmed and the lines are joined by a single blank;
     * no blank is inserted before the first non-empty fragment.</p>
     *
     * @return the joined text, or the empty string if the end of input was already reached
     */
    private String readMultipleLines() throws IOException {
        if (this.lastLine == null) {
            return "";
        }
        StringBuilder result = new StringBuilder(this.lastLine.trim());
        readLine();
        while (this.lastLine != null && !isBlank(this.lastLine)
               && !startsWithKeyword(RECOGNIZED_FIELDS) && !isStartOfWorkingPaper()) {
            if (result.length() > 0) {
                result.append(' ');
            }
            result.append(this.lastLine.trim());
            readLine();
        }
        return result.toString();
    }

    /**
     * Implements grammar rule "TitleString".
     *
     * @param be the entry that receives the "title" field
     * @throws IOException if reading fails
     */
    private void parseTitleString(BibtexEntry be) throws IOException {
        // skip article number
        this.lastLine = this.lastLine.substring(this.lastLine.indexOf('.') + 1, this.lastLine.length());
        be.setField("title", readMultipleLines());
    }

    /**
     * Implements grammar rule "Authors".
     *
     * <p>Reads one author per line until a blank line, a recognized field or the end of
     * input. Author names are joined with " and " into the "author" field, institutions
     * (the text in parentheses, possibly continued on following lines) into the
     * "institution" field. Empty names and empty institutions are skipped.</p>
     *
     * @param be the entry that receives the fields
     * @throws IOException if reading fails
     */
    private void parseAuthors(BibtexEntry be) throws IOException {
        StringBuilder authors = new StringBuilder();
        StringBuilder institutions = new StringBuilder();
        while (this.lastLine != null && !isBlank(this.lastLine) && !startsWithKeyword(RECOGNIZED_FIELDS)) {
            String author;
            StringBuilder institution = null;
            boolean institutionDone;

            int open = this.lastLine.indexOf('(');
            if (open >= 0) {
                author = this.lastLine.substring(0, open).trim();
                // only a ')' after the '(' closes the institution
                int close = this.lastLine.indexOf(')', open + 1);
                institutionDone = close >= 0;
                institution = new StringBuilder(
                    this.lastLine.substring(open + 1, institutionDone ? close : this.lastLine.length()).trim());
            } else {
                author = this.lastLine.trim();
                institutionDone = true;
            }

            readLine();
            // the institution continues on the following lines until its ')' is found
            while (!institutionDone && this.lastLine != null) {
                int close = this.lastLine.indexOf(')');
                institutionDone = close >= 0;
                String part = this.lastLine.substring(0, institutionDone ? close : this.lastLine.length()).trim();
                if (part.length() > 0) {
                    if (institution.length() > 0) {
                        institution.append(' ');
                    }
                    institution.append(part);
                }
                readLine();
            }

            if (author.length() > 0) {
                if (authors.length() > 0) {
                    authors.append(" and ");
                }
                authors.append(author);
            }
            if (institution != null && institution.length() > 0) {
                if (institutions.length() > 0) {
                    institutions.append(" and ");
                }
                institutions.append(institution);
            }
        }

        if (authors.length() > 0) {
            be.setField("author", authors.toString());
        }
        if (institutions.length() > 0) {
            be.setField("institution", institutions.toString());
        }
    }

    /**
     * Implements grammar rule "Abstract".
     *
     * @param be the entry that receives the "abstract" field if the abstract is not empty
     * @throws IOException if reading fails
     */
    private void parseAbstract(BibtexEntry be) throws IOException {
        String theabstract = readMultipleLines();

        if (!theabstract.equals("")) {
            be.setField("abstract", theabstract);
        }
    }

    /**
     * Implements grammar rule "AdditionalFields".
     *
     * <p>Consumes recognized fields and blank lines until the next working paper, any
     * other text or the end of input.</p>
     *
     * @param be the entry that receives the fields
     * @param multilineUrlFieldAllowed if true, a URL value may continue on following lines;
     *        if false, only the rest of the 'URL:' line is used
     * @throws IOException if reading fails
     */
    private void parseAdditionalFields(BibtexEntry be, boolean multilineUrlFieldAllowed) throws IOException {
        // one empty line is possible before fields start
        if (this.lastLine != null && isBlank(this.lastLine)) {
            readLine();
        }

        // read other fields
        while (this.lastLine != null && !isStartOfWorkingPaper()
               && (startsWithKeyword(RECOGNIZED_FIELDS) || isBlank(this.lastLine))) {

            // blank lines between fields are skipped
            if (isBlank(this.lastLine)) {
                readLine();
                continue;
            }

            int colon = this.lastLine.indexOf(':');
            String keyword = this.lastLine.substring(0, colon).trim();
            // skip keyword, so that the current line holds the field content only
            this.lastLine = this.lastLine.substring(colon + 1, this.lastLine.length()).trim();

            if (keyword.equals("Keywords")) {
                parseKeywordsField(be);
            } else if (keyword.equals("JEL")) {
                be.setField("jel", readMultipleLines());
            } else if (keyword.startsWith("Date")) {
                parseDateField(be);
            } else if (keyword.startsWith("URL")) {
                String content;
                if (multilineUrlFieldAllowed) {
                    content = readMultipleLines();
                } else {
                    content = this.lastLine;
                    readLine();
                }
                be.setField("url", content);
            } else if (keyword.startsWith("By")) {
                parseAuthors(be);
            } else {
                readLine();
            }
        }
    }

    /**
     * Reads a keyword list separated by ',' or ';' and stores it in the "keywords" field
     * as blank-separated, single-quoted keywords.
     */
    private void parseKeywordsField(BibtexEntry be) throws IOException {
        String[] keywords = readMultipleLines().split("[,;]");
        StringBuilder keywordStr = new StringBuilder();
        for (int i = 0; i < keywords.length; i++) {
            keywordStr.append(" '").append(keywords[i].trim()).append("'");
        }
        be.setField("keywords", keywordStr.toString().trim());
    }

    /**
     * Reads a date and stores its year in the "year" field. If the date matches a format
     * that contains a month, the month is stored in the "month" field as a number from
     * 1 (January) to 12 (December). If the date matches none of the recognized formats,
     * the current year of the system clock is used and no month is set.
     */
    private void parseDateField(BibtexEntry be) throws IOException {
        String content = readMultipleLines();
        Date date = null;
        String matchedFormat = null;
        for (int i = 0; i < RECOGNIZED_DATE_FORMATS.length && date == null; i++) {
            try {
                date = new SimpleDateFormat(RECOGNIZED_DATE_FORMATS[i]).parse(content);
                matchedFormat = RECOGNIZED_DATE_FORMATS[i];
            } catch (ParseException ignored) {
                // content does not have this format; try the next, less specific one
            }
        }

        Calendar cal = new GregorianCalendar();
        cal.setTime(date != null ? date : new Date());
        be.setField("year", "" + cal.get(Calendar.YEAR));
        if (matchedFormat != null && matchedFormat.indexOf("MM") >= 0) {
            // Calendar.MONTH is zero-based (January == 0)
            be.setField("month", "" + (cal.get(Calendar.MONTH) + 1));
        }
    }

    /**
     * if line starts with a string of the form 'x. ' and we are not in the overview
     * section and the previous line is blank, we have a working paper entry we are
     * interested in
     *
     * @return false at the end of input
     */
    private boolean isStartOfWorkingPaper() {
        return this.lastLine != null
            && this.preLine != null
            && !this.inOverviewSection
            && isBlank(this.preLine)
            && WORKING_PAPER_START.matcher(this.lastLine).matches();
    }

    /**
     * Parses a NEP message and returns one "techreport" entry per working paper found
     * outside the overview section.
     *
     * <p>The reader created here is not closed; the stream stays under the caller's
     * control.</p>
     *
     * @param stream the NEP message
     * @return the imported entries, possibly empty
     * @throws IOException if reading fails, or if any other exception occurs while
     *         parsing (that exception is then attached as the cause)
     * @see net.sf.jabref.imports.ImportFormat#importEntries(java.io.InputStream)
     */
    public List<BibtexEntry> importEntries(InputStream stream) throws IOException {
        List<BibtexEntry> bibitems = new ArrayList<BibtexEntry>();
        String paperNoStr = null;

        // reset all parser state, so that a previous import on this instance cannot leak into this one
        this.line = 0;
        this.lastLine = "";
        this.preLine = "";
        this.inOverviewSection = false;

        try {
            this.in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));

            readLine(); // skip header and editor information
            while (this.lastLine != null) {

                if (this.lastLine.startsWith("-----------------------------")) {
                    this.inOverviewSection = this.preLine.startsWith("In this issue we have");
                }

                if (isStartOfWorkingPaper()) {
                    BibtexEntry be = new BibtexEntry(Util.createNeutralId());
                    be.setType(BibtexEntryType.getType("techreport"));
                    paperNoStr = this.lastLine.substring(0, this.lastLine.indexOf('.'));
                    parseTitleString(be);
                    if (startsWithKeyword(RECOGNIZED_FIELDS)) {
                        parseAdditionalFields(be, false);
                    } else {
                        readLine(); // skip empty line
                        parseAuthors(be);
                        readLine(); // skip empty line
                    }
                    if (!startsWithKeyword(RECOGNIZED_FIELDS)) {
                        parseAbstract(be);
                    }
                    parseAdditionalFields(be, true);

                    bibitems.add(be);
                    paperNoStr = null;
                } else {
                    readLine();
                }
            }

        } catch (Exception e) {
            String message = "Error in REPEC-NEP import on line " + this.line;
            if (paperNoStr != null) {
                message += ", paper no. " + paperNoStr;
            }
            message += ": " + e.getMessage();
            System.err.println(message);
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            e.printStackTrace();
            // IOException(String, Throwable) needs Java 6; initCause keeps the cause on Java 5 as well
            IOException wrapped = new IOException(message);
            wrapped.initCause(e);
            throw wrapped;
        }

        return bibitems;
    }
}