// Copyright 2014 Michel Kraemer // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package de.undercouch.citeproc; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; import java.util.Map; import org.jbibtex.BibTeXDatabase; import org.jbibtex.ParseException; import org.yaml.snakeyaml.Yaml; import de.undercouch.citeproc.bibtex.BibTeXConverter; import de.undercouch.citeproc.bibtex.BibTeXItemDataProvider; import de.undercouch.citeproc.csl.CSLItemData; import de.undercouch.citeproc.endnote.EndNoteConverter; import de.undercouch.citeproc.endnote.EndNoteItemDataProvider; import de.undercouch.citeproc.endnote.EndNoteLibrary; import de.undercouch.citeproc.helper.json.JsonLexer; import de.undercouch.citeproc.helper.json.JsonParser; import de.undercouch.citeproc.ris.RISConverter; import de.undercouch.citeproc.ris.RISItemDataProvider; import de.undercouch.citeproc.ris.RISLibrary; /** * A convenience class providing methods to load any supported kind of * bibliography files. The class automatically detects the correct file * format and returns a {@link ItemDataProvider} that holds all * bibliography items read from the file. * @author Michel Kraemer */ public class BibliographyFileReader { /** * Supported file formats for bibliography files */ public static enum FileFormat { /** * A BibTeX file */ BIBTEX, /** * A CSL JSON object */ JSON_OBJECT, /** * An array of CSL JSON objects */ JSON_ARRAY, /** * A YAML document * @since 1.1.0 */ YAML, /** * An EndNote file */ ENDNOTE, /** * An RIS file */ RIS, /** * Unknown file format */ UNKNOWN } /** * Reads all items from an input bibliography file and returns a provider * serving these items * @param bibfile the input file * @return the provider * @throws FileNotFoundException if the input file was not found * @throws IOException if the input file could not be read */ public ItemDataProvider readBibliographyFile(File bibfile) throws FileNotFoundException, IOException { //open buffered input stream to bibliography file if (!bibfile.exists()) { throw new FileNotFoundException("Bibliography file `" + bibfile.getName() + "' does not exist"); } try (BufferedInputStream bis = new BufferedInputStream( new FileInputStream(bibfile))) { return readBibliographyFile(bis, bibfile.getName()); } } /** * Reads all items from an input stream and returns a provider * serving these items. Note that you can supply an additional file * name to help the method to determine the exact bibliography file format. * If you don't know the file name you can pass null, but in this case the * method's result might try to read the input stream using the wrong * file format (depending on the input stream's contents). Also note * that the caller is responsible for closing the given input stream. * @param bibstream the input stream * @param filename the name of the input file (can be null if you don't * know the name) * @return the provider * @throws IOException if the input stream could not be read */ public ItemDataProvider readBibliographyFile(InputStream bibstream, String filename) throws IOException { BufferedInputStream bis; if (bibstream instanceof BufferedInputStream) { bis = (BufferedInputStream)bibstream; } else { bis = new BufferedInputStream(bibstream); } //determine file format FileFormat ff = determineFileFormat(bis, filename); //read stream return readBibliographyFile(bis, ff); } /** * Reads all items from an input stream using the given file format and * returns a provider serving these items. * @param bibstream the input stream * @param format the bibliography file format * @return the provider * @throws IOException if the input stream could not be read */ public ItemDataProvider readBibliographyFile(InputStream bibstream, FileFormat format) throws IOException { ItemDataProvider provider; try { //load bibliography file if (format == FileFormat.BIBTEX) { BibTeXDatabase db = new BibTeXConverter().loadDatabase(bibstream); BibTeXItemDataProvider bibtexprovider = new BibTeXItemDataProvider(); bibtexprovider.addDatabase(db); provider = bibtexprovider; } else if (format == FileFormat.JSON_ARRAY || format == FileFormat.JSON_OBJECT) { JsonParser parser = new JsonParser(new JsonLexer( new InputStreamReader(bibstream))); List<Object> objs; if (format == FileFormat.JSON_ARRAY) { objs = parser.parseArray(); } else { objs = new ArrayList<>(); objs.add(parser.parseObject()); } CSLItemData[] items = new CSLItemData[objs.size()]; for (int i = 0; i < items.length; ++i) { @SuppressWarnings("unchecked") Map<String, Object> obj = (Map<String, Object>)objs.get(i); items[i] = CSLItemData.fromJson(obj); } provider = new ListItemDataProvider(items); } else if (format == FileFormat.YAML) { Yaml yaml = new Yaml(); Iterable<Object> documentsIterable = yaml.loadAll(bibstream); List<List<Object>> documents = new ArrayList<>(); documentsIterable.forEach(o -> { if (o instanceof Map) { documents.add(Arrays.asList(o)); } else { documents.add(new ArrayList<>((Collection<?>)o)); } }); List<ItemDataProvider> providers = new ArrayList<>(); for (List<Object> objs : documents) { CSLItemData[] items = new CSLItemData[objs.size()]; for (int i = 0; i < items.length; ++i) { @SuppressWarnings("unchecked") Map<String, Object> obj = (Map<String, Object>)objs.get(i); items[i] = CSLItemData.fromJson(obj); } ItemDataProvider p = new ListItemDataProvider(items); providers.add(p); } if (providers.size() == 1) { provider = providers.get(0); } else { provider = new CompoundItemDataProvider(providers); } } else if (format == FileFormat.ENDNOTE) { EndNoteLibrary lib = new EndNoteConverter().loadLibrary(bibstream); EndNoteItemDataProvider endnoteprovider = new EndNoteItemDataProvider(); endnoteprovider.addLibrary(lib); provider = endnoteprovider; } else if (format == FileFormat.RIS) { RISLibrary lib = new RISConverter().loadLibrary(bibstream); RISItemDataProvider risprovider = new RISItemDataProvider(); risprovider.addLibrary(lib); provider = risprovider; } else { throw new IOException("Unknown bibliography file format"); } } catch (ParseException e) { throw new IOException("Could not parse bibliography file", e); } return provider; } /** * Reads the first 100 KB of the given bibliography file and tries * to determine the file format * @param bibfile the input file * @return the file format (or {@link FileFormat#UNKNOWN} if the format * could not be determined) * @throws FileNotFoundException if the input file was not found * @throws IOException if the input file could not be read */ public FileFormat determineFileFormat(File bibfile) throws FileNotFoundException, IOException { if (!bibfile.exists()) { throw new FileNotFoundException("Bibliography file `" + bibfile.getName() + "' does not exist"); } try (BufferedInputStream bis = new BufferedInputStream( new FileInputStream(bibfile))) { return determineFileFormat(bis, bibfile.getName()); } } /** * Reads the first bytes of the given input stream and tries to * determine the file format. Resets the input stream to the position * it had when the method was called. Reads up to 100 KB and before * giving up. Note that you can supply an additional file name to help * the method to determine the exact file format. If you don't know the * file name you can pass null, but in this case the method's result * might be wrong (depending on the input stream's contents). * @param bis a buffered input stream that supports the mark and reset * methods * @param filename the name of the input file (can be null if you don't * know the name) * @return the file format (or {@link FileFormat#UNKNOWN} if the format * could not be determined) * @throws IOException if the input stream could not be read */ public FileFormat determineFileFormat(BufferedInputStream bis, String filename) throws IOException { int len = 1024 * 100; String ext = ""; if (filename != null) { int dot = filename.lastIndexOf('.'); if (dot > 0) { ext = filename.substring(dot + 1); } } //check the first couple of bytes bis.mark(len); try { byte[] firstCharacters = new byte[6]; bis.read(firstCharacters); //check if the file starts with a %YAML directive if (firstCharacters[0] == '%' && firstCharacters[1] == 'Y' && firstCharacters[2] == 'A' && firstCharacters[3] == 'M' && firstCharacters[4] == 'L' && Character.isWhitespace(firstCharacters[5])) { return FileFormat.YAML; } //check if the file starts with an EndNote tag, but //also make sure the extension is not 'bib' or 'yml'/'yaml' //because BibTeX comments and YAML directives look like //EndNote tags if (firstCharacters[0] == '%' && Character.isWhitespace(firstCharacters[2]) && !ext.equalsIgnoreCase("bib") && !ext.equalsIgnoreCase("yaml") && !ext.equalsIgnoreCase("yml")) { return FileFormat.ENDNOTE; } //check if the file starts with a RIS type tag if (firstCharacters[0] == 'T' && firstCharacters[1] == 'Y' && Character.isWhitespace(firstCharacters[2]) && Character.isWhitespace(firstCharacters[3]) && firstCharacters[4] == '-') { return FileFormat.RIS; } } finally { bis.reset(); } //now check if it's json, bibtex or yaml bis.mark(len); try { while (true) { int c = bis.read(); --len; if (c < 0 || len < 2) { return FileFormat.UNKNOWN; } if (!Character.isWhitespace(c)) { if (c == '[') { return FileFormat.JSON_ARRAY; } else if (c == '{') { return FileFormat.JSON_OBJECT; } if (ext.equalsIgnoreCase("yaml") || ext.equalsIgnoreCase("yml")) { return FileFormat.YAML; } return FileFormat.BIBTEX; } } } finally { bis.reset(); } } }