BibliographyFileReader.java example

Explorer

citeproc-java-master
- citeproc-java
  - src
    - main
      - java
        de
        undercouch
        citeproc
        AbbreviationProvider.java
        BibliographyFileReader.java
        CSL.java
        CSLDateParser.java
        CompoundItemDataProvider.java
        DefaultAbbreviationProvider.java
        DefaultLocaleProvider.java
        ItemDataProvider.java
        ListItemDataProvider.java
        LocaleProvider.java
        VariableWrapper.java
        bibtex
        BibTeXConverter.java
        BibTeXItemDataProvider.java
        DateParser.java
        NameParser.java
        PageParser.java
        PageRange.java
        csl
        CitationIDIndexPair.java
        helper
        CSLUtils.java
        Levenshtein.java
        StringHelper.java
        json
        JsonBuilder.java
        JsonBuilderFactory.java
        JsonLexer.java
        JsonObject.java
        JsonParser.java
        MapJsonBuilder.java
        MapJsonBuilderFactory.java
        StringJsonBuilder.java
        StringJsonBuilderFactory.java
        oauth
        AuthenticationStore.java
        FileAuthenticationStore.java
        OAuth.java
        OAuth1.java
        OAuth2.java
        PercentEncoding.java
        RequestException.java
        Response.java
        Token.java
        UnauthorizedException.java
        lint
        AbstractLinter.java
        DefaultLintErrorEvent.java
        DefaultLintEvent.java
        DefaultLintItemEvent.java
        DefaultLinter.java
        DuplicateCitationIdLinter.java
        LintErrorEvent.java
        LintEvent.java
        LintFilter.java
        LintItemEvent.java
        LintListener.java
        LintListenerAdapter.java
        Linter.java
        mendeley
        MendeleyConnector.java
        MendeleyConverter.java
        remote
        AbstractRemoteConnector.java
        AuthenticatedRemoteConnector.java
        RemoteConnector.java
        RemoteConnectorAdapter.java
        script
        AbstractScriptRunner.java
        JREScriptRunner.java
        ScriptRunner.java
        ScriptRunnerException.java
        ScriptRunnerFactory.java
        V8ScriptRunner.java
        zotero
        ZoteroConnector.java
        ZoteroItemDataProvider.java
        ZoteroOAuth.java
    - test
      - java
        de
        undercouch
        citeproc
        CSLDateParserTest.java
        CSLTest.java
        CompoundItemDataProviderTest.java
        TestSuiteRunner.java
        bibtex
        AbstractBibTeXTest.java
        BibTeXConverterTest.java
        BibTeXItemDataProviderTest.java
        DateParserTest.java
        NameParserTest.java
        PageParserTest.java
        csl
        CSLCitationItemBuilderTest.java
        endnote
        EndNoteConverterTest.java
        EndNoteParserTest.java
        helper
        LevenshteinTest.java
        StringHelperTest.java
        json
        JsonLexerTest.java
        JsonParserTest.java
        MapJsonBuilderTest.java
        StringJsonBuilderTest.java
        oauth
        FileAuthenticationStoreTest.java
        mendeley
        MendeleyConverterTest.java
        ris
        RISConverterTest.java
        RISParserTest.java
        script
        ScriptRunnerBenchmark.java
  - templates
- citeproc-java-tool
  - src
    - main
      - java
        de
        undercouch
        citeproc
        CSLTool.java
        helper
        tool
        CachingBibliographyFileReader.java
        CachingRemoteConnector.java
        ToolUtils.java
        tool
        AbstractCSLToolCommand.java
        AbstractRemoteCommand.java
        BibliographyCommand.java
        CSLToolCommand.java
        CSLToolContext.java
        CitationCommand.java
        CitationIdsCommand.java
        HelpCommand.java
        InputFileCommand.java
        JsonCommand.java
        LintCommand.java
        ListCommand.java
        MendeleyBibliographyCommand.java
        MendeleyCitationCommand.java
        MendeleyCommand.java
        MendeleyJsonCommand.java
        MendeleyListCommand.java
        MendeleySyncCommand.java
        NeedsSynchronization.java
        ProviderCommand.java
        ShellCommand.java
        ZoteroBibliographyCommand.java
        ZoteroCitationCommand.java
        ZoteroCommand.java
        ZoteroJsonCommand.java
        ZoteroListCommand.java
        ZoteroSyncCommand.java
        shell
        AdditionalShellCommands.java
        ConsoleInputReader.java
        ErrorOutputStream.java
        ShellCommandCompleter.java
        ShellCommandParser.java
        ShellContext.java
        ShellExitCommand.java
        ShellGetCommand.java
        ShellGetFormatCommand.java
        ShellGetLocaleCommand.java
        ShellGetStyleCommand.java
        ShellHelpCommand.java
        ShellLoadCommand.java
        ShellQuitCommand.java
        ShellSetCommand.java
        ShellSetFormatCommand.java
        ShellSetLocaleCommand.java
        ShellSetStyleCommand.java
    - test
      - java
        de
        undercouch
        citeproc
        tool
        shell
        ShellCommandCompleterTest.java
        ShellCommandParserTest.java

// Copyright 2014 Michel Kraemer
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package de.undercouch.citeproc;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;

import org.jbibtex.BibTeXDatabase;
import org.jbibtex.ParseException;
import org.yaml.snakeyaml.Yaml;

import de.undercouch.citeproc.bibtex.BibTeXConverter;
import de.undercouch.citeproc.bibtex.BibTeXItemDataProvider;
import de.undercouch.citeproc.csl.CSLItemData;
import de.undercouch.citeproc.endnote.EndNoteConverter;
import de.undercouch.citeproc.endnote.EndNoteItemDataProvider;
import de.undercouch.citeproc.endnote.EndNoteLibrary;
import de.undercouch.citeproc.helper.json.JsonLexer;
import de.undercouch.citeproc.helper.json.JsonParser;
import de.undercouch.citeproc.ris.RISConverter;
import de.undercouch.citeproc.ris.RISItemDataProvider;
import de.undercouch.citeproc.ris.RISLibrary;

/**
 * A convenience class providing methods to load any supported kind of
 * bibliography files. The class automatically detects the correct file
 * format and returns an {@link ItemDataProvider} that holds all
 * bibliography items read from the file.
 * @author Michel Kraemer
 */
public class BibliographyFileReader {
	/**
	 * The bibliography file formats this reader distinguishes
	 */
	public enum FileFormat {
		/** BibTeX file */
		BIBTEX,

		/** A single CSL JSON object */
		JSON_OBJECT,

		/** A JSON array containing CSL JSON objects */
		JSON_ARRAY,

		/**
		 * YAML document
		 * @since 1.1.0
		 */
		YAML,

		/** EndNote file */
		ENDNOTE,

		/** RIS file */
		RIS,

		/** The file format is not known */
		UNKNOWN
	}
	
	/**
	 * Loads every item contained in a bibliography file and wraps them
	 * in a provider. The file's name is passed on as a hint for the
	 * detection of the file format.
	 * @param bibfile the bibliography file to load
	 * @return a provider serving the items read from the file
	 * @throws FileNotFoundException if the file does not exist
	 * @throws IOException if the file could not be read
	 */
	public ItemDataProvider readBibliographyFile(File bibfile)
			throws FileNotFoundException, IOException {
		final String name = bibfile.getName();
		if (bibfile.exists()) {
			//the stream is closed automatically as soon as the
			//items have been read (or reading has failed)
			try (BufferedInputStream in = new BufferedInputStream(
					new FileInputStream(bibfile))) {
				return readBibliographyFile(in, name);
			}
		}
		throw new FileNotFoundException("Bibliography file `" +
				name + "' does not exist");
	}
	
	/**
	 * Detects the bibliography file format of the given input stream,
	 * reads all items from it and returns a provider serving these items.
	 * The optional file name is only used as a hint for the format
	 * detection. If it is null, the detection relies on the stream's
	 * contents alone and may therefore select the wrong file format.
	 * The caller is responsible for closing the given input stream.
	 * @param bibstream the input stream
	 * @param filename the name of the input file (can be null if you don't
	 * know the name)
	 * @return the provider
	 * @throws IOException if the input stream could not be read
	 */
	public ItemDataProvider readBibliographyFile(InputStream bibstream,
			String filename) throws IOException {
		//format detection works on a BufferedInputStream (it uses
		//mark and reset), so wrap the stream unless it already is one
		BufferedInputStream buffered =
				(bibstream instanceof BufferedInputStream)
						? (BufferedInputStream)bibstream
						: new BufferedInputStream(bibstream);

		//detect the format first, then read the stream with it
		return readBibliographyFile(buffered,
				determineFileFormat(buffered, filename));
	}
	
	/**
	 * Reads all items from an input stream using the given file format and
	 * returns a provider serving these items. JSON input is decoded as
	 * UTF-8. A YAML stream may contain several documents; each document
	 * must either be a single object or a list of objects. If the stream
	 * does not contain exactly one (non-empty) document, the result is a
	 * {@link CompoundItemDataProvider} combining one provider per document.
	 * @param bibstream the input stream
	 * @param format the bibliography file format
	 * @return the provider
	 * @throws IOException if the input stream could not be read or parsed,
	 * if it contains items that are not objects, or if the given format
	 * is not one of the supported formats
	 */
	public ItemDataProvider readBibliographyFile(InputStream bibstream,
			FileFormat format) throws IOException {
		ItemDataProvider provider;
		try {
			//load bibliography file
			if (format == FileFormat.BIBTEX) {
				BibTeXDatabase db = new BibTeXConverter().loadDatabase(bibstream);
				BibTeXItemDataProvider bibtexprovider = new BibTeXItemDataProvider();
				bibtexprovider.addDatabase(db);
				provider = bibtexprovider;
			} else if (format == FileFormat.JSON_ARRAY ||
					format == FileFormat.JSON_OBJECT) {
				//JSON text is exchanged as UTF-8, so decode it explicitly
				//instead of relying on the platform's default charset
				JsonParser parser = new JsonParser(new JsonLexer(
						new InputStreamReader(bibstream, StandardCharsets.UTF_8)));
				List<Object> objs;
				if (format == FileFormat.JSON_ARRAY) {
					objs = parser.parseArray();
				} else {
					objs = new ArrayList<>();
					objs.add(parser.parseObject());
				}
				provider = toItemDataProvider(objs);
			} else if (format == FileFormat.YAML) {
				//NOTE(review): depending on the SnakeYAML version, a Yaml
				//instance created with the default constructor may
				//instantiate classes named in YAML tags. Consider using a
				//safe constructor if untrusted files have to be supported.
				Yaml yaml = new Yaml();
				List<ItemDataProvider> providers = new ArrayList<>();
				for (Object doc : yaml.loadAll(bibstream)) {
					if (doc == null) {
						//an empty document does not contain any items
						continue;
					}
					Collection<?> objs;
					if (doc instanceof Map) {
						//a single item
						objs = Arrays.asList(doc);
					} else if (doc instanceof Collection) {
						//a list of items
						objs = (Collection<?>)doc;
					} else {
						throw new IOException("YAML document is neither an "
								+ "object nor a list of objects");
					}
					providers.add(toItemDataProvider(objs));
				}
				if (providers.size() == 1) {
					provider = providers.get(0);
				} else {
					provider = new CompoundItemDataProvider(providers);
				}
			} else if (format == FileFormat.ENDNOTE) {
				EndNoteLibrary lib = new EndNoteConverter().loadLibrary(bibstream);
				EndNoteItemDataProvider endnoteprovider = new EndNoteItemDataProvider();
				endnoteprovider.addLibrary(lib);
				provider = endnoteprovider;
			} else if (format == FileFormat.RIS) {
				RISLibrary lib = new RISConverter().loadLibrary(bibstream);
				RISItemDataProvider risprovider = new RISItemDataProvider();
				risprovider.addLibrary(lib);
				provider = risprovider;
			} else {
				throw new IOException("Unknown bibliography file format");
			}
		} catch (ParseException e) {
			throw new IOException("Could not parse bibliography file", e);
		}
		
		return provider;
	}
	
	/**
	 * Converts a collection of parsed JSON or YAML objects to citation
	 * items and wraps them in a provider
	 * @param objs the parsed objects; each of them must be a map
	 * @return a provider serving the converted items
	 * @throws IOException if one of the given objects is not a map
	 */
	private static ItemDataProvider toItemDataProvider(Collection<?> objs)
			throws IOException {
		CSLItemData[] items = new CSLItemData[objs.size()];
		int i = 0;
		for (Object o : objs) {
			//report malformed input through the documented exception
			//type instead of failing with a ClassCastException
			if (!(o instanceof Map)) {
				throw new IOException("Bibliography item is not an object");
			}
			@SuppressWarnings("unchecked")
			Map<String, Object> obj = (Map<String, Object>)o;
			items[i++] = CSLItemData.fromJson(obj);
		}
		return new ListItemDataProvider(items);
	}
	
	/**
	 * Opens the given bibliography file and tries to determine its file
	 * format from the beginning of its contents (at most 100 KB are
	 * inspected) and from its name
	 * @param bibfile the bibliography file to inspect
	 * @return the detected file format (or {@link FileFormat#UNKNOWN} if
	 * the format could not be determined)
	 * @throws FileNotFoundException if the file does not exist
	 * @throws IOException if the file could not be read
	 */
	public FileFormat determineFileFormat(File bibfile)
			throws FileNotFoundException, IOException {
		final String name = bibfile.getName();
		if (bibfile.exists()) {
			//the stream is only needed for the detection and is
			//closed automatically afterwards
			try (BufferedInputStream in = new BufferedInputStream(
					new FileInputStream(bibfile))) {
				return determineFileFormat(in, name);
			}
		}
		throw new FileNotFoundException("Bibliography file `" +
				name + "' does not exist");
	}
	
	/**
	 * Reads the first bytes of the given input stream and tries to
	 * determine the file format. Resets the input stream to the position
	 * it had when the method was called. Reads up to 100 KB before
	 * giving up. Note that you can supply an additional file name to help
	 * the method to determine the exact file format. If you don't know the
	 * file name you can pass null, but in this case the method's result
	 * might be wrong (depending on the input stream's contents).
	 * <p>The checks are performed in the following order: a leading
	 * <code>%YAML</code> directive, an EndNote tag (unless the file
	 * extension is <code>bib</code>, <code>yaml</code> or <code>yml</code>),
	 * a RIS type tag, and finally the first non-whitespace character
	 * (<code>[</code> for a JSON array, <code>{</code> for a JSON object).
	 * Any other content is considered YAML if the file extension is
	 * <code>yaml</code> or <code>yml</code> and BibTeX otherwise.</p>
	 * @param bis a buffered input stream that supports the mark and reset
	 * methods
	 * @param filename the name of the input file (can be null if you don't
	 * know the name)
	 * @return the file format (or {@link FileFormat#UNKNOWN} if the format
	 * could not be determined)
	 * @throws IOException if the input stream could not be read
	 */
	public FileFormat determineFileFormat(BufferedInputStream bis,
			String filename) throws IOException {
		int len = 1024 * 100;
		
		String ext = "";
		if (filename != null) {
			int dot = filename.lastIndexOf('.');
			if (dot > 0) {
				ext = filename.substring(dot + 1);
			}
		}
		boolean yamlExt = ext.equalsIgnoreCase("yaml") ||
				ext.equalsIgnoreCase("yml");
		
		//check the first couple of bytes
		bis.mark(len);
		try {
			byte[] firstCharacters = new byte[6];
			//a single call to read() may deliver fewer bytes than
			//requested although the stream has not ended yet. Keep
			//reading until the buffer is full or the end of the stream
			//has been reached. Bytes that could not be read stay 0 and
			//hence never match any of the checks below.
			int filled = 0;
			while (filled < firstCharacters.length) {
				int r = bis.read(firstCharacters, filled,
						firstCharacters.length - filled);
				if (r < 0) {
					break;
				}
				filled += r;
			}
			
			//check if the file starts with a %YAML directive
			if (firstCharacters[0] == '%' &&
					firstCharacters[1] == 'Y' &&
					firstCharacters[2] == 'A' &&
					firstCharacters[3] == 'M' &&
					firstCharacters[4] == 'L' &&
					Character.isWhitespace(firstCharacters[5])) {
				return FileFormat.YAML;
			}
			
			//check if the file starts with an EndNote tag, but
			//also make sure the extension is not 'bib' or 'yml'/'yaml'
			//because BibTeX comments and YAML directives look like
			//EndNote tags
			if (firstCharacters[0] == '%' &&
					Character.isWhitespace(firstCharacters[2]) &&
					!ext.equalsIgnoreCase("bib") &&
					!yamlExt) {
				return FileFormat.ENDNOTE;
			}
			
			//check if the file starts with a RIS type tag
			if (firstCharacters[0] == 'T' && firstCharacters[1] == 'Y' &&
					Character.isWhitespace(firstCharacters[2]) &&
					Character.isWhitespace(firstCharacters[3]) &&
					firstCharacters[4] == '-') {
				return FileFormat.RIS;
			}
		} finally {
			//rewind so the next check starts at the original position
			bis.reset();
		}
		
		//now check if it's json, bibtex or yaml
		bis.mark(len);
		try {
			while (true) {
				int c = bis.read();
				--len;
				//give up at the end of the stream or shortly before
				//the mark's read limit would be exceeded
				if (c < 0 || len < 2) {
					return FileFormat.UNKNOWN;
				}
				
				//the first non-whitespace character decides
				if (!Character.isWhitespace(c)) {
					if (c == '[') {
						return FileFormat.JSON_ARRAY;
					} else if (c == '{') {
						return FileFormat.JSON_OBJECT;
					}
					if (yamlExt) {
						return FileFormat.YAML;
					}
					return FileFormat.BIBTEX;
				}
			}
		} finally {
			//leave the stream at the position it had on entry
			bis.reset();
		}
	}
}