// Copyright 2013 The Docear Project and Michel Kraemer
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package de.undercouch.citeproc.bibtex;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.jbibtex.BibTeXDatabase;
import org.jbibtex.BibTeXEntry;
import org.jbibtex.BibTeXParser;
import org.jbibtex.BibTeXString;
import org.jbibtex.Key;
import org.jbibtex.LaTeXObject;
import org.jbibtex.LaTeXParser;
import org.jbibtex.LaTeXPrinter;
import org.jbibtex.ParseException;
import org.jbibtex.TokenMgrException;
import org.jbibtex.Value;
import de.undercouch.citeproc.csl.CSLDate;
import de.undercouch.citeproc.csl.CSLItemData;
import de.undercouch.citeproc.csl.CSLItemDataBuilder;
import de.undercouch.citeproc.csl.CSLType;
/**
 * <p>Converts BibTeX items to CSL citation items</p>
 * <p>The class maps BibTeX attributes to CSL attributes. The mapping is
 * based on the one used in <a href="http://www.docear.org">Docear</a> as
 * <a href="http://www.docear.org/2012/08/08/docear4word-mapping-bibtex-fields-and-types-with-the-citation-style-language/">presented
 * by Joeran Beel</a>.</p>
 * <p>Docear is released under the GPLv2 but its code may also be reused in
 * projects licensed under Apache License 2.0 (see
 * <a href="http://www.docear.org/software/licence/">http://www.docear.org/software/licence/</a>,
 * last visited 2013-09-06). The mapping here is released under the
 * Apache License 2.0 by permission of Joeran Beel, Docear.</p>
 * @author Joeran Beel
 * @author Michel Kraemer
 */
public class BibTeXConverter {
    // BibTeX field names (always compared against lower-cased keys)
    private static final String FIELD_ABSTRACT = "abstract";
    private static final String FIELD_ACCESSED = "accessed";
    private static final String FIELD_ADDRESS = "address";
    private static final String FIELD_ANNOTE = "annote";
    private static final String FIELD_AUTHOR = "author";
    private static final String FIELD_BOOKTITLE = "booktitle";
    private static final String FIELD_CHAPTER = "chapter";
    private static final String FIELD_DATE = "date";
    private static final String FIELD_DOI = "doi";
    private static final String FIELD_EDITION = "edition";
    private static final String FIELD_EDITOR = "editor";
    private static final String FIELD_INSTITUTION = "institution";
    private static final String FIELD_ISBN = "isbn";
    private static final String FIELD_ISSN = "issn";
    private static final String FIELD_ISSUE = "issue";
    private static final String FIELD_JOURNAL = "journal";
    private static final String FIELD_KEYWORDS = "keywords";
    private static final String FIELD_LOCATION = "location";
    private static final String FIELD_MONTH = "month";
    private static final String FIELD_NOTE = "note";
    private static final String FIELD_NUMBER = "number";
    private static final String FIELD_ORGANIZATION = "organization";
    private static final String FIELD_PAGES = "pages";
    private static final String FIELD_PUBLISHER = "publisher";
    private static final String FIELD_REVISION = "revision";
    private static final String FIELD_SCHOOL = "school";
    private static final String FIELD_SERIES = "series";
    private static final String FIELD_STATUS = "status";
    private static final String FIELD_TITLE = "title";
    private static final String FIELD_URL = "url";
    private static final String FIELD_URLDATE = "urldate";
    private static final String FIELD_VOLUME = "volume";
    private static final String FIELD_YEAR = "year";

    // BibTeX entry types (compared case-insensitively in toType)
    private static final String TYPE_ARTICLE = "article";
    private static final String TYPE_BOOK = "book";
    private static final String TYPE_BOOKLET = "booklet";
    private static final String TYPE_CONFERENCE = "conference";
    private static final String TYPE_ELECTRONIC = "electronic";
    private static final String TYPE_INBOOK = "inbook";
    private static final String TYPE_INCOLLECTION = "incollection";
    private static final String TYPE_INPROCEEDINGS = "inproceedings";
    private static final String TYPE_MANUAL = "manual";
    private static final String TYPE_MASTERSTHESIS = "mastersthesis";
    private static final String TYPE_ONLINE = "online";
    private static final String TYPE_PATENT = "patent";
    private static final String TYPE_PERIODICAL = "periodical";
    private static final String TYPE_PHDTHESIS = "phdthesis";
    private static final String TYPE_PROCEEDINGS = "proceedings";
    private static final String TYPE_STANDARD = "standard";
    private static final String TYPE_TECHREPORT = "techreport";
    private static final String TYPE_UNPUBLISHED = "unpublished";
    private static final String TYPE_WWW = "www";

    private final LaTeXParser latexParser;
    private final LaTeXPrinter latexPrinter;

    /**
     * Default constructor
     */
    public BibTeXConverter() {
        try {
            latexParser = new LaTeXParser();
        } catch (ParseException e) {
            // can actually never happen because the default constructor
            // of LaTeXParser doesn't throw
            throw new RuntimeException(e);
        }
        latexPrinter = new LaTeXPrinter();
    }

    /**
     * <p>Loads a BibTeX database from a stream. The stream is decoded
     * as UTF-8.</p>
     * <p>This method does not close the given stream. The caller is
     * responsible for closing it.</p>
     * @param is the input stream to read from
     * @return the BibTeX database
     * @throws IOException if the database could not be read
     * @throws ParseException if the database is invalid. Tokenizer
     * failures are wrapped and available through {@link Throwable#getCause()}.
     */
    public BibTeXDatabase loadDatabase(InputStream is) throws IOException, ParseException {
        Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
        BibTeXParser parser = new BibTeXParser() {
            @Override
            public void checkStringResolution(Key key, BibTeXString string) {
                // intentionally empty: a string reference that could not be
                // resolved (string == null) is ignored by this converter
            }
        };
        try {
            return parser.parse(reader);
        } catch (TokenMgrException err) {
            // surface tokenizer failures as ParseException but keep the
            // original exception as the cause for diagnostics
            ParseException wrapped = new ParseException(
                    "Could not parse BibTeX library: " + err.getMessage());
            wrapped.initCause(err);
            throw wrapped;
        }
    }

    /**
     * Converts the given database to a map of CSL citation items
     * @param db the database
     * @return a map consisting of citation keys and citation items
     */
    public Map<String, CSLItemData> toItemData(BibTeXDatabase db) {
        Map<String, CSLItemData> result = new HashMap<>();
        for (Map.Entry<Key, BibTeXEntry> e : db.getEntries().entrySet()) {
            result.put(e.getKey().getValue(), toItemData(e.getValue()));
        }
        return result;
    }

    /**
     * Converts a BibTeX entry to a citation item
     * @param e the BibTeX entry to convert
     * @return the citation item
     */
    public CSLItemData toItemData(BibTeXEntry e) {
        //get all fields from the BibTeX entry
        Map<String, String> entries = convertFields(e);

        //map type
        CSLType type = toType(e.getType());
        CSLItemDataBuilder builder = new CSLItemDataBuilder()
                .id(e.getKey().getValue()).type(type);

        //map address ("location" takes precedence over "address")
        String place = firstPresent(entries, FIELD_LOCATION, FIELD_ADDRESS);
        builder.eventPlace(place);
        builder.publisherPlace(place);

        //map author
        String author = entries.get(FIELD_AUTHOR);
        if (author != null) {
            builder.author(NameParser.parse(author));
        }

        //map editor (parsed separately for each attribute so that the
        //attributes never share the same parse result)
        String editor = entries.get(FIELD_EDITOR);
        if (editor != null) {
            builder.editor(NameParser.parse(editor));
            builder.collectionEditor(NameParser.parse(editor));
            builder.containerAuthor(NameParser.parse(editor));
        }

        //map date
        if (type == CSLType.WEBPAGE && entries.containsKey(FIELD_URLDATE)) {
            //web pages with a "urldate" only get an issued date
            builder.issued(DateParser.toDate(entries.get(FIELD_URLDATE)));
        } else {
            CSLDate date;
            if (entries.containsKey(FIELD_DATE)) {
                date = DateParser.toDate(entries.get(FIELD_DATE));
            } else {
                date = DateParser.toDate(entries.get(FIELD_YEAR),
                        entries.get(FIELD_MONTH));
            }
            builder.issued(date);
            builder.eventDate(date);
        }

        //map journal, booktitle, series
        String container = firstPresent(entries,
                FIELD_JOURNAL, FIELD_BOOKTITLE, FIELD_SERIES);
        builder.containerTitle(container);
        builder.collectionTitle(container);

        //map number and issue ("number" doubles as issue if there is no "issue")
        builder.number(entries.get(FIELD_NUMBER));
        builder.issue(firstPresent(entries, FIELD_ISSUE, FIELD_NUMBER));

        //map publisher, institution, school, organisation
        //(the order of precedence depends on the item type)
        String publisher;
        if (type == CSLType.REPORT) {
            publisher = firstPresent(entries, FIELD_PUBLISHER,
                    FIELD_INSTITUTION, FIELD_SCHOOL, FIELD_ORGANIZATION);
        } else if (type == CSLType.THESIS) {
            publisher = firstPresent(entries, FIELD_PUBLISHER,
                    FIELD_SCHOOL, FIELD_INSTITUTION, FIELD_ORGANIZATION);
        } else {
            publisher = firstPresent(entries, FIELD_PUBLISHER,
                    FIELD_ORGANIZATION, FIELD_INSTITUTION, FIELD_SCHOOL);
        }
        builder.publisher(publisher);

        //map title or chapter
        builder.title(firstPresent(entries, FIELD_TITLE, FIELD_CHAPTER));

        //map pages
        String pages = entries.get(FIELD_PAGES);
        if (pages != null) {
            PageRange pr = PageParser.parse(pages);
            builder.page(pr.getLiteral());
            builder.pageFirst(pr.getPageFirst());
            if (pr.getNumberOfPages() != null) {
                builder.numberOfPages(String.valueOf(pr.getNumberOfPages()));
            }
        }

        //map last accessed date
        if (entries.containsKey(FIELD_ACCESSED)) {
            builder.accessed(DateParser.toDate(entries.get(FIELD_ACCESSED)));
        }

        //map other attributes
        builder.volume(entries.get(FIELD_VOLUME));
        builder.keyword(entries.get(FIELD_KEYWORDS));
        builder.URL(entries.get(FIELD_URL));
        builder.status(entries.get(FIELD_STATUS));
        builder.ISSN(entries.get(FIELD_ISSN));
        builder.ISBN(entries.get(FIELD_ISBN));
        builder.version(entries.get(FIELD_REVISION));
        builder.annote(entries.get(FIELD_ANNOTE));
        builder.edition(entries.get(FIELD_EDITION));
        builder.abstrct(entries.get(FIELD_ABSTRACT));
        builder.DOI(entries.get(FIELD_DOI));
        builder.note(entries.get(FIELD_NOTE));

        //create citation item
        return builder.build();
    }

    /**
     * Reads all fields of a BibTeX entry into a map whose keys are the
     * lower-cased field names and whose values are the field contents
     * converted from LaTeX to plain text.
     * @param e the BibTeX entry
     * @return the map of field names to plain-text values
     */
    private Map<String, String> convertFields(BibTeXEntry e) {
        Map<String, String> result = new HashMap<>();
        for (Map.Entry<Key, Value> field : e.getFields().entrySet()) {
            String us = field.getValue().toUserString().replace("\r", "");

            //convert LaTeX string to normal text
            try {
                List<LaTeXObject> objs = latexParser.parse(new StringReader(us));
                us = latexPrinter.print(objs)
                        .replace('\n', ' ').replace("\r", "").trim();
            } catch (ParseException | TokenMgrException ignored) {
                //best effort: if the value is not valid LaTeX, keep the
                //raw user string instead of failing the whole conversion
            }

            //Locale.ROOT keeps the keys comparable with the FIELD_*
            //constants regardless of the JVM's default locale (e.g. Turkish
            //would otherwise turn "TITLE" into "t\u0131tle")
            result.put(field.getKey().getValue().toLowerCase(Locale.ROOT), us);
        }
        return result;
    }

    /**
     * Returns the value of the first of the given keys that has a value
     * in the given map
     * @param fields the map to look into
     * @param keys the keys to try, in order of precedence
     * @return the first value found or <code>null</code> if none of the
     * keys has a value
     */
    private static String firstPresent(Map<String, String> fields, String... keys) {
        for (String key : keys) {
            String value = fields.get(key);
            if (value != null) {
                return value;
            }
        }
        return null;
    }

    /**
     * Converts a BibTeX type to a CSL type
     * @param type the type to convert
     * @return the converted type (never null, falls back to {@link CSLType#ARTICLE})
     */
    public CSLType toType(Key type) {
        String s = type.getValue();
        if (isOneOf(s, TYPE_ARTICLE)) {
            return CSLType.ARTICLE_JOURNAL;
        }
        if (isOneOf(s, TYPE_PROCEEDINGS, TYPE_MANUAL, TYPE_BOOK, TYPE_PERIODICAL)) {
            return CSLType.BOOK;
        }
        if (isOneOf(s, TYPE_BOOKLET)) {
            return CSLType.PAMPHLET;
        }
        if (isOneOf(s, TYPE_INBOOK, TYPE_INCOLLECTION)) {
            return CSLType.CHAPTER;
        }
        if (isOneOf(s, TYPE_INPROCEEDINGS, TYPE_CONFERENCE)) {
            return CSLType.PAPER_CONFERENCE;
        }
        if (isOneOf(s, TYPE_MASTERSTHESIS, TYPE_PHDTHESIS)) {
            return CSLType.THESIS;
        }
        if (isOneOf(s, TYPE_TECHREPORT)) {
            return CSLType.REPORT;
        }
        if (isOneOf(s, TYPE_PATENT)) {
            return CSLType.PATENT;
        }
        if (isOneOf(s, TYPE_ELECTRONIC, TYPE_ONLINE, TYPE_WWW)) {
            return CSLType.WEBPAGE;
        }
        if (isOneOf(s, TYPE_STANDARD)) {
            return CSLType.LEGISLATION;
        }
        if (isOneOf(s, TYPE_UNPUBLISHED)) {
            return CSLType.MANUSCRIPT;
        }
        return CSLType.ARTICLE;
    }

    /**
     * Checks if a string equals one of the given candidates, ignoring case
     * @param s the string to check
     * @param candidates the candidates to compare with
     * @return <code>true</code> if <code>s</code> matches at least one candidate
     */
    private static boolean isOneOf(String s, String... candidates) {
        for (String candidate : candidates) {
            if (s.equalsIgnoreCase(candidate)) {
                return true;
            }
        }
        return false;
    }
}