package org.jabref.logic.importer.fileformat; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PushbackReader; import java.io.Reader; import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.Deque; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; import org.jabref.logic.bibtex.FieldContentParser; import org.jabref.logic.exporter.BibtexDatabaseWriter; import org.jabref.logic.exporter.SavePreferences; import org.jabref.logic.importer.ImportFormatPreferences; import org.jabref.logic.importer.ParseException; import org.jabref.logic.importer.Parser; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.importer.util.MetaDataParser; import org.jabref.logic.l10n.Localization; import org.jabref.model.database.BibDatabase; import org.jabref.model.database.KeyCollisionException; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.BibtexString; import org.jabref.model.entry.CustomEntryType; import org.jabref.model.entry.EntryType; import org.jabref.model.entry.FieldName; import org.jabref.model.entry.FieldProperty; import org.jabref.model.entry.InternalBibtexFields; import org.jabref.model.metadata.MetaData; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * Class for importing BibTeX-files. * <p> * Use: * <p> * BibtexParser parser = new BibtexParser(reader); * <p> * ParserResult result = parser.parse(); * <p> * or * <p> * ParserResult result = BibtexParser.parse(reader); * <p> * Can be used stand-alone. */ public class BibtexParser implements Parser { private static final Log LOGGER = LogFactory.getLog(BibtexParser.class); private static final Integer LOOKAHEAD = 64; private final FieldContentParser fieldContentParser; private final Deque<Character> pureTextFromFile = new LinkedList<>(); private final ImportFormatPreferences importFormatPreferences; private PushbackReader pushbackReader; private BibDatabase database; private Map<String, EntryType> entryTypes; private boolean eof; private int line = 1; private ParserResult parserResult; public BibtexParser(ImportFormatPreferences importFormatPreferences) { this.importFormatPreferences = Objects.requireNonNull(importFormatPreferences); fieldContentParser = new FieldContentParser(importFormatPreferences.getFieldContentParserPreferences()); } /** * Shortcut usage to create a Parser and read the input. * * @param in the Reader to read from * @throws IOException * @deprecated inline this method */ @Deprecated public static ParserResult parse(Reader in, ImportFormatPreferences importFormatPreferences) throws IOException { return new BibtexParser(importFormatPreferences).parse(in); } /** * Parses BibtexEntries from the given string and returns one entry found (or null if none found) * <p> * It is undetermined which entry is returned, so use this in case you know there is only one entry in the string. * * @param bibtexString * @return An Optional<BibEntry>. Optional.empty() if non was found or an error occurred. * @throws ParseException */ public static Optional<BibEntry> singleFromString(String bibtexString, ImportFormatPreferences importFormatPreferences) throws ParseException { Collection<BibEntry> entries = new BibtexParser(importFormatPreferences).parseEntries(bibtexString); if ((entries == null) || entries.isEmpty()) { return Optional.empty(); } return Optional.of(entries.iterator().next()); } @Override public List<BibEntry> parseEntries(InputStream inputStream) throws ParseException { BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)); return parseEntries(reader); } public List<BibEntry> parseEntries(Reader reader) throws ParseException { try { return parse(reader).getDatabase().getEntries(); } catch (IOException e) { throw new ParseException(e); } } public List<BibEntry> parseEntries(String bibtexString) throws ParseException { return parseEntries(new StringReader(bibtexString)); } /** * Will parse the BibTex-Data found when reading from reader. Ignores any encoding supplied in the file by * "Encoding: myEncoding". * <p> * The reader will be consumed. * <p> * Multiple calls to parse() return the same results * * @return ParserResult * @throws IOException */ public ParserResult parse(Reader in) throws IOException { Objects.requireNonNull(in); pushbackReader = new PushbackReader(in, BibtexParser.LOOKAHEAD); // Bibtex related contents. initializeParserResult(); parseDatabaseID(); skipWhitespace(); try { return parseFileContent(); } catch (KeyCollisionException kce) { throw new IOException("Duplicate ID in bibtex file: " + kce); } } private void initializeParserResult() { database = new BibDatabase(); entryTypes = new HashMap<>(); // To store custom entry types parsed. parserResult = new ParserResult(database, new MetaData(), entryTypes); } private void parseDatabaseID() throws IOException { while (!eof) { skipWhitespace(); char c = (char) read(); if (c == '%') { skipWhitespace(); String label = parseTextToken().trim(); if (label.equals(BibtexDatabaseWriter.DATABASE_ID_PREFIX)) { skipWhitespace(); database.setSharedDatabaseID(parseTextToken().trim()); } } else if (c == '@') { unread(c); break; } } } private ParserResult parseFileContent() throws IOException { Map<String, String> meta = new HashMap<>(); while (!eof) { boolean found = consumeUncritically('@'); if (!found) { break; } skipWhitespace(); // Try to read the entry type String entryType = parseTextToken().toLowerCase(Locale.ROOT).trim(); if ("preamble".equals(entryType)) { database.setPreamble(parsePreamble()); // Consume new line which signals end of preamble skipOneNewline(); // the preamble is saved verbatim anyways, so the text read so far can be dropped dumpTextReadSoFarToString(); } else if ("string".equals(entryType)) { parseBibtexString(); } else if ("comment".equals(entryType)) { parseJabRefComment(meta); } else { // Not a comment, preamble, or string. Thus, it is an entry parseAndAddEntry(entryType); } skipWhitespace(); } // Instantiate meta data: try { parserResult.setMetaData(MetaDataParser.parse(meta, importFormatPreferences.getKeywordSeparator())); } catch (ParseException exception) { parserResult.addException(exception); } parseRemainingContent(); return parserResult; } private void parseRemainingContent() { database.setEpilog(dumpTextReadSoFarToString().trim()); } private void parseAndAddEntry(String type) { /** * Morten Alver 13 Aug 2006: Trying to make the parser more * robust. If an exception is thrown when parsing an entry, * drop the entry and try to resume parsing. Add a warning * for the user. */ try { // collect all comments and the entry type definition in front of the actual entry // this is at least `@Type` String commentsAndEntryTypeDefinition = dumpTextReadSoFarToString(); BibEntry entry = parseEntry(type); // store comments collected without type definition entry.setCommentsBeforeEntry( commentsAndEntryTypeDefinition.substring(0, commentsAndEntryTypeDefinition.lastIndexOf('@'))); // store complete parsed serialization (comments, type definition + type contents) entry.setParsedSerialization(commentsAndEntryTypeDefinition + dumpTextReadSoFarToString()); boolean duplicateKey = database.insertEntry(entry); if (duplicateKey) { parserResult.addDuplicateKey(entry.getCiteKey()); } } catch (IOException ex) { LOGGER.debug("Could not parse entry", ex); parserResult.addWarning(Localization.lang("Error occurred when parsing entry") + ": '" + ex.getMessage() + "'. " + Localization.lang("Skipped entry.")); } } private void parseJabRefComment(Map<String, String> meta) { StringBuilder buffer = null; try { buffer = parseBracketedTextExactly(); } catch (IOException e) { /* if we get an IO Exception here, than we have an unbracketed comment, * which means that we should just return and the comment will be picked up as arbitrary text * by the parser */ LOGGER.info("Found unbracketed comment"); return; } String comment = buffer.toString().replaceAll("[\\x0d\\x0a]", ""); if (comment.substring(0, Math.min(comment.length(), MetaData.META_FLAG.length())).equals(MetaData.META_FLAG)) { if (comment.substring(0, MetaData.META_FLAG.length()).equals(MetaData.META_FLAG)) { String rest = comment.substring(MetaData.META_FLAG.length()); int pos = rest.indexOf(':'); if (pos > 0) { // We remove all line breaks in the metadata - these // will have been inserted // to prevent too long lines when the file was // saved, and are not part of the data. meta.put(rest.substring(0, pos), rest.substring(pos + 1)); // meta comments are always re-written by JabRef and not stored in the file dumpTextReadSoFarToString(); } } } else if (comment.substring(0, Math.min(comment.length(), CustomEntryType.ENTRYTYPE_FLAG.length())) .equals(CustomEntryType.ENTRYTYPE_FLAG)) { // A custom entry type can also be stored in a // "@comment" Optional<CustomEntryType> typ = CustomEntryType.parse(comment); if (typ.isPresent()) { entryTypes.put(typ.get().getName(), typ.get()); } else { parserResult.addWarning(Localization.lang("Ill-formed entrytype comment in BIB file") + ": " + comment); } // custom entry types are always re-written by JabRef and not stored in the file dumpTextReadSoFarToString(); } } private void parseBibtexString() throws IOException { BibtexString bibtexString = parseString(); bibtexString.setParsedSerialization(dumpTextReadSoFarToString()); try { database.addString(bibtexString); } catch (KeyCollisionException ex) { parserResult.addWarning(Localization.lang("Duplicate string name") + ": " + bibtexString.getName()); } } /** * Puts all text that has been read from the reader, including newlines, etc., since the last call of this method into a string. * Removes the JabRef file header, if it is found * * @return the text read so far */ private String dumpTextReadSoFarToString() { String result = getPureTextFromFile(); int indexOfAt = result.indexOf("@"); // if there is no entry found, simply return the content (necessary to parse text remaining after the last entry) if (indexOfAt == -1) { return purgeEOFCharacters(result); } else if (result.contains(BibtexDatabaseWriter.DATABASE_ID_PREFIX)) { return purge(result, BibtexDatabaseWriter.DATABASE_ID_PREFIX); } else if (result.contains(SavePreferences.ENCODING_PREFIX)) { return purge(result, SavePreferences.ENCODING_PREFIX); } else { return result; } } private String purge(String context, String stringToPurge) { // purge the encoding line if it exists int runningIndex = context.indexOf(stringToPurge); int indexOfAt = context.indexOf("@"); while (runningIndex < indexOfAt) { if (context.charAt(runningIndex) == '\n') { break; } else if (context.charAt(runningIndex) == '\r') { if (context.charAt(runningIndex + 1) == '\n') { runningIndex++; } break; } runningIndex++; } return context.substring(runningIndex + 1); } private String getPureTextFromFile() { StringBuilder entry = new StringBuilder(); while (!pureTextFromFile.isEmpty()) { entry.append(pureTextFromFile.pollFirst()); } return entry.toString(); } /** * Removes all eof characters from a StringBuilder and returns a new String with the resulting content * * @return a String without eof characters */ private String purgeEOFCharacters(String input) { StringBuilder remainingText = new StringBuilder(); for (Character character : input.toCharArray()) { if (!(isEOFCharacter(character))) { remainingText.append(character); } } return remainingText.toString(); } private void skipWhitespace() throws IOException { int character; while (true) { character = read(); if (isEOFCharacter(character)) { eof = true; return; } if (!Character.isWhitespace((char) character)) { // found non-whitespace char unread(character); break; } } } private void skipSpace() throws IOException { int character; while (true) { character = read(); if (isEOFCharacter(character)) { eof = true; return; } if ((char) character != ' ') { // found non-space char unread(character); break; } } } private void skipOneNewline() throws IOException { skipSpace(); if (peek() == '\r') { read(); } if (peek() == '\n') { read(); } } private boolean isEOFCharacter(int character) { return (character == -1) || (character == 65535); } private String skipAndRecordWhitespace(int character) throws IOException { StringBuilder stringBuilder = new StringBuilder(); if (character != ' ') { stringBuilder.append((char) character); } while (true) { int nextCharacter = read(); if (isEOFCharacter(nextCharacter)) { eof = true; return stringBuilder.toString(); } if (Character.isWhitespace((char) nextCharacter)) { if (nextCharacter != ' ') { stringBuilder.append((char) nextCharacter); } } else { // found non-whitespace char unread(nextCharacter); break; } } return stringBuilder.toString(); } private int peek() throws IOException { int character = read(); unread(character); return character; } private int read() throws IOException { int character = pushbackReader.read(); if (!isEOFCharacter(character)) { pureTextFromFile.offerLast((char) character); } if (character == '\n') { line++; } return character; } private void unread(int character) throws IOException { if (character == '\n') { line--; } pushbackReader.unread(character); if (pureTextFromFile.getLast() == character) { pureTextFromFile.pollLast(); } } private BibtexString parseString() throws IOException { skipWhitespace(); consume('{', '('); skipWhitespace(); LOGGER.debug("Parsing string name"); String name = parseTextToken(); LOGGER.debug("Parsed string name"); skipWhitespace(); LOGGER.debug("Now the contents"); consume('='); String content = parseFieldContent(name); LOGGER.debug("Now I'm going to consume a }"); consume('}', ')'); // Consume new line which signals end of entry skipOneNewline(); LOGGER.debug("Finished string parsing."); return new BibtexString(name, content); } private String parsePreamble() throws IOException { skipWhitespace(); return parseBracketedText().toString(); } private BibEntry parseEntry(String entryType) throws IOException { BibEntry result = new BibEntry(entryType); skipWhitespace(); consume('{', '('); int character = peek(); if ((character != '\n') && (character != '\r')) { skipWhitespace(); } String key = parseKey(); result.setCiteKey(key); skipWhitespace(); while (true) { character = peek(); if ((character == '}') || (character == ')')) { break; } if (character == ',') { consume(','); } skipWhitespace(); character = peek(); if ((character == '}') || (character == ')')) { break; } parseField(result); } consume('}', ')'); // Consume new line which signals end of entry skipOneNewline(); return result; } private void parseField(BibEntry entry) throws IOException { String key = parseTextToken().toLowerCase(Locale.ROOT); skipWhitespace(); consume('='); String content = parseFieldContent(key); if (!content.isEmpty()) { if (entry.hasField(key)) { // The following hack enables the parser to deal with multiple // author or // editor lines, stringing them together instead of getting just // one of them. // Multiple author or editor lines are not allowed by the bibtex // format, but // at least one online database exports bibtex like that, making // it inconvenient // for users if JabRef did not accept it. if (InternalBibtexFields.getFieldProperties(key).contains(FieldProperty.PERSON_NAMES)) { entry.setField(key, entry.getField(key).get() + " and " + content); } else if (FieldName.KEYWORDS.equals(key)) { //multiple keywords fields should be combined to one entry.addKeyword(content, importFormatPreferences.getKeywordSeparator()); } } else { entry.setField(key, content); } } } private String parseFieldContent(String key) throws IOException { skipWhitespace(); StringBuilder value = new StringBuilder(); int character; while (((character = peek()) != ',') && (character != '}') && (character != ')')) { if (eof) { throw new IOException("Error in line " + line + ": EOF in mid-string"); } if (character == '"') { StringBuilder text = parseQuotedFieldExactly(); value.append(fieldContentParser.format(text, key)); } else if (character == '{') { // Value is a string enclosed in brackets. There can be pairs // of brackets inside of a field, so we need to count the // brackets to know when the string is finished. StringBuilder text = parseBracketedTextExactly(); value.append(fieldContentParser.format(text, key)); } else if (Character.isDigit((char) character)) { // value is a number String number = parseTextToken(); value.append(number); } else if (character == '#') { consume('#'); } else { String textToken = parseTextToken(); if (textToken.isEmpty()) { throw new IOException("Error in line " + line + " or above: " + "Empty text token.\nThis could be caused " + "by a missing comma between two fields."); } value.append('#').append(textToken).append('#'); } skipWhitespace(); } return value.toString(); } /** * This method is used to parse string labels, field names, entry type and * numbers outside brackets. */ private String parseTextToken() throws IOException { StringBuilder token = new StringBuilder(20); while (true) { int character = read(); if (character == -1) { eof = true; return token.toString(); } if (Character.isLetterOrDigit((char) character) || (":-_*+./'".indexOf(character) >= 0)) { token.append((char) character); } else { unread(character); return token.toString(); } } } /** * Tries to restore the key * * @return rest of key on success, otherwise empty string * @throws IOException on Reader-Error */ private String fixKey() throws IOException { StringBuilder key = new StringBuilder(); int lookaheadUsed = 0; char currentChar; // Find a char which ends key (','&&'\n') or entryfield ('='): do { currentChar = (char) read(); key.append(currentChar); lookaheadUsed++; } while ((currentChar != ',') && (currentChar != '\n') && (currentChar != '=') && (lookaheadUsed < BibtexParser.LOOKAHEAD)); // Consumed a char too much, back into reader and remove from key: unread(currentChar); key.deleteCharAt(key.length() - 1); // Restore if possible: switch (currentChar) { case '=': // Get entryfieldname, push it back and take rest as key key = key.reverse(); boolean matchedAlpha = false; for (int i = 0; i < key.length(); i++) { currentChar = key.charAt(i); /// Skip spaces: if (!matchedAlpha && (currentChar == ' ')) { continue; } matchedAlpha = true; // Begin of entryfieldname (e.g. author) -> push back: unread(currentChar); if ((currentChar == ' ') || (currentChar == '\n')) { /* * found whitespaces, entryfieldname completed -> key in * keybuffer, skip whitespaces */ StringBuilder newKey = new StringBuilder(); for (int j = i; j < key.length(); j++) { currentChar = key.charAt(j); if (!Character.isWhitespace(currentChar)) { newKey.append(currentChar); } } // Finished, now reverse newKey and remove whitespaces: parserResult.addWarning( Localization.lang("Line %0: Found corrupted BibTeX key.", String.valueOf(line))); key = newKey.reverse(); } } break; case ',': parserResult.addWarning(Localization.lang("Line %0: Found corrupted BibTeX key (contains whitespaces).", String.valueOf(line))); break; case '\n': parserResult.addWarning( Localization.lang("Line %0: Found corrupted BibTeX key (comma missing).", String.valueOf(line))); break; default: // No more lookahead, give up: unreadBuffer(key); return ""; } return removeWhitespaces(key).toString(); } /** * returns a new <code>StringBuilder</code> which corresponds to <code>toRemove</code> without whitespaces * * @param toRemove * @return */ private StringBuilder removeWhitespaces(StringBuilder toRemove) { StringBuilder result = new StringBuilder(); char current; for (int i = 0; i < toRemove.length(); ++i) { current = toRemove.charAt(i); if (!Character.isWhitespace(current)) { result.append(current); } } return result; } /** * pushes buffer back into input * * @param stringBuilder * @throws IOException can be thrown if buffer is bigger than LOOKAHEAD */ private void unreadBuffer(StringBuilder stringBuilder) throws IOException { for (int i = stringBuilder.length() - 1; i >= 0; --i) { unread(stringBuilder.charAt(i)); } } /** * This method is used to parse the bibtex key for an entry. */ private String parseKey() throws IOException { StringBuilder token = new StringBuilder(20); while (true) { int character = read(); if (character == -1) { eof = true; return token.toString(); } if (!Character.isWhitespace((char) character) && (Character.isLetterOrDigit((char) character) || (character == ':') || ("#{}~,=\uFFFD".indexOf(character) == -1))) { token.append((char) character); } else { if (Character.isWhitespace((char) character)) { // We have encountered white space instead of the comma at // the end of // the key. Possibly the comma is missing, so we try to // return what we // have found, as the key and try to restore the rest in fixKey(). return token + fixKey(); } else if ((character == ',') || (character == '}')) { unread(character); return token.toString(); } else if (character == '=') { // If we find a '=' sign, it is either an error, or // the entry lacked a comma signifying the end of the key. return token.toString(); } else { throw new IOException("Error in line " + line + ":" + "Character '" + (char) character + "' is not " + "allowed in bibtex keys."); } } } } private StringBuffer parseBracketedText() throws IOException { StringBuffer value = new StringBuffer(); consume('{', '('); int brackets = 0; while (!((isClosingBracketNext()) && (brackets == 0))) { int character = read(); if (isEOFCharacter(character)) { throw new IOException("Error in line " + line + ": EOF in mid-string"); } else if ((character == '{') || (character == '(')) { brackets++; } else if ((character == '}') || (character == ')')) { brackets--; } // If we encounter whitespace of any kind, read it as a // simple space, and ignore any others that follow immediately. /* * if (j == '\n') { if (peek() == '\n') value.append('\n'); } else */ if (Character.isWhitespace((char) character)) { String whitespacesReduced = skipAndRecordWhitespace(character); if (!(whitespacesReduced.isEmpty()) && !"\n\t".equals(whitespacesReduced)) { // && whitespacesReduced = whitespacesReduced.replace("\t", ""); // Remove tabulators. value.append(whitespacesReduced); } else { value.append(' '); } } else { value.append((char) character); } } consume('}', ')'); return value; } private boolean isClosingBracketNext() { try { int peek = peek(); boolean isCurlyBracket = peek == '}'; boolean isRoundBracket = peek == ')'; return isCurlyBracket || isRoundBracket; } catch (IOException e) { return false; } } private StringBuilder parseBracketedTextExactly() throws IOException { StringBuilder value = new StringBuilder(); consume('{'); int brackets = 0; char character; char lastCharacter = '\0'; while (true) { character = (char) read(); boolean isClosingBracket = (character == '}') && (lastCharacter != '\\'); if (isClosingBracket && (brackets == 0)) { return value; } else if (isEOFCharacter(character)) { throw new IOException("Error in line " + line + ": EOF in mid-string"); } else if ((character == '{') && (!isEscapeSymbol(lastCharacter))) { brackets++; } else if (isClosingBracket) { brackets--; } value.append(character); lastCharacter = character; } } private boolean isEscapeSymbol(char character) { return '\\' == character; } private StringBuilder parseQuotedFieldExactly() throws IOException { StringBuilder value = new StringBuilder(); consume('"'); int brackets = 0; while (!((peek() == '"') && (brackets == 0))) { int j = read(); if (isEOFCharacter(j)) { throw new IOException("Error in line " + line + ": EOF in mid-string"); } else if (j == '{') { brackets++; } else if (j == '}') { brackets--; } value.append((char) j); } consume('"'); return value; } private void consume(char expected) throws IOException { int character = read(); if (character != expected) { throw new IOException( "Error in line " + line + ": Expected " + expected + " but received " + (char) character); } } private boolean consumeUncritically(char expected) throws IOException { int character; do { character = read(); } while ((character != expected) && (character != -1) && (character != 65535)); if (isEOFCharacter(character)) { eof = true; } // Return true if we actually found the character we were looking for: return character == expected; } private void consume(char firstOption, char secondOption) throws IOException { // Consumes one of the two, doesn't care which appears. int character = read(); if ((character != firstOption) && (character != secondOption)) { throw new IOException("Error in line " + line + ": Expected " + firstOption + " or " + secondOption + " but received " + (char) character); } } }