package org.jabref.logic.importer.fileformat; import java.io.BufferedReader; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.jabref.logic.importer.Importer; import org.jabref.logic.importer.ParserResult; import org.jabref.logic.util.FileExtensions; import org.jabref.model.entry.AuthorList; import org.jabref.model.entry.BibEntry; import org.jabref.model.entry.FieldName; /** * Imports an Ovid file. */ public class OvidImporter extends Importer { private static final Pattern OVID_SOURCE_PATTERN = Pattern .compile("Source ([ \\w&\\-,:]+)\\.[ ]+([0-9]+)\\(([\\w\\-]+)\\):([0-9]+\\-?[0-9]+?)\\,.*([0-9][0-9][0-9][0-9])"); private static final Pattern OVID_SOURCE_PATTERN_NO_ISSUE = Pattern .compile("Source ([ \\w&\\-,:]+)\\.[ ]+([0-9]+):([0-9]+\\-?[0-9]+?)\\,.*([0-9][0-9][0-9][0-9])"); private static final Pattern OVID_SOURCE_PATTERN_2 = Pattern.compile( "([ \\w&\\-,]+)\\. Vol ([0-9]+)\\(([\\w\\-]+)\\) ([A-Za-z]+) ([0-9][0-9][0-9][0-9]), ([0-9]+\\-?[0-9]+)"); private static final Pattern INCOLLECTION_PATTERN = Pattern.compile( "(.+)\\(([0-9][0-9][0-9][0-9])\\)\\. ([ \\w&\\-,:]+)\\.[ ]+\\(pp. ([0-9]+\\-?[0-9]+?)\\).[A-Za-z0-9, ]+pp\\. " + "([\\w, ]+): ([\\w, ]+)"); private static final Pattern BOOK_PATTERN = Pattern.compile( "\\(([0-9][0-9][0-9][0-9])\\)\\. [A-Za-z, ]+([0-9]+) pp\\. ([\\w, ]+): ([\\w, ]+)"); private static final String OVID_PATTERN_STRING = "<[0-9]+>"; private static final Pattern OVID_PATTERN = Pattern.compile(OVID_PATTERN_STRING); private static final int MAX_ITEMS = 50; @Override public String getName() { return "Ovid"; } @Override public FileExtensions getExtensions() { return FileExtensions.OVID; } @Override public String getDescription() { return "Imports an Ovid file."; } @Override public boolean isRecognizedFormat(BufferedReader reader) throws IOException { String str; int i = 0; while (((str = reader.readLine()) != null) && (i < MAX_ITEMS)) { if (OvidImporter.OVID_PATTERN.matcher(str).find()) { return true; } i++; } return false; } @Override public ParserResult importDatabase(BufferedReader reader) throws IOException { List<BibEntry> bibitems = new ArrayList<>(); StringBuilder sb = new StringBuilder(); String line; while ((line = reader.readLine()) != null) { if (!line.isEmpty() && (line.charAt(0) != ' ')) { sb.append("__NEWFIELD__"); } sb.append(line); sb.append('\n'); } String[] items = sb.toString().split(OVID_PATTERN_STRING); for (int i = 1; i < items.length; i++) { Map<String, String> h = new HashMap<>(); String[] fields = items[i].split("__NEWFIELD__"); for (String field : fields) { int linebreak = field.indexOf('\n'); String fieldName = field.substring(0, linebreak).trim(); String content = field.substring(linebreak).trim(); // Check if this is the author field (due to a minor special treatment for this field): boolean isAuthor = (fieldName.indexOf("Author") == 0) && !fieldName.contains("Author Keywords") && !fieldName.contains("Author e-mail"); // Remove unnecessary dots at the end of lines, unless this is the author field, // in which case a dot at the end could be significant: if (!isAuthor && content.endsWith(".")) { content = content.substring(0, content.length() - 1); } if (isAuthor) { h.put(FieldName.AUTHOR, content); } else if (fieldName.startsWith("Title")) { content = content.replaceAll("\\[.+\\]", "").trim(); if (content.endsWith(".")) { content = content.substring(0, content.length() - 1); } h.put(FieldName.TITLE, content); } else if (fieldName.startsWith("Chapter Title")) { h.put("chaptertitle", content); } else if (fieldName.startsWith("Source")) { Matcher matcher; if ((matcher = OvidImporter.OVID_SOURCE_PATTERN.matcher(content)).find()) { h.put(FieldName.JOURNAL, matcher.group(1)); h.put(FieldName.VOLUME, matcher.group(2)); h.put(FieldName.ISSUE, matcher.group(3)); h.put(FieldName.PAGES, matcher.group(4)); h.put(FieldName.YEAR, matcher.group(5)); } else if ((matcher = OvidImporter.OVID_SOURCE_PATTERN_NO_ISSUE.matcher(content)).find()) { // may be missing the issue h.put(FieldName.JOURNAL, matcher.group(1)); h.put(FieldName.VOLUME, matcher.group(2)); h.put(FieldName.PAGES, matcher.group(3)); h.put(FieldName.YEAR, matcher.group(4)); } else if ((matcher = OvidImporter.OVID_SOURCE_PATTERN_2.matcher(content)).find()) { h.put(FieldName.JOURNAL, matcher.group(1)); h.put(FieldName.VOLUME, matcher.group(2)); h.put(FieldName.ISSUE, matcher.group(3)); h.put(FieldName.MONTH, matcher.group(4)); h.put(FieldName.YEAR, matcher.group(5)); h.put(FieldName.PAGES, matcher.group(6)); } else if ((matcher = OvidImporter.INCOLLECTION_PATTERN.matcher(content)).find()) { h.put(FieldName.EDITOR, matcher.group(1).replace(" (Ed)", "")); h.put(FieldName.YEAR, matcher.group(2)); h.put(FieldName.BOOKTITLE, matcher.group(3)); h.put(FieldName.PAGES, matcher.group(4)); h.put(FieldName.ADDRESS, matcher.group(5)); h.put(FieldName.PUBLISHER, matcher.group(6)); } else if ((matcher = OvidImporter.BOOK_PATTERN.matcher(content)).find()) { h.put(FieldName.YEAR, matcher.group(1)); h.put(FieldName.PAGES, matcher.group(2)); h.put(FieldName.ADDRESS, matcher.group(3)); h.put(FieldName.PUBLISHER, matcher.group(4)); } // Add double hyphens to page ranges: if (h.get(FieldName.PAGES) != null) { h.put(FieldName.PAGES, h.get(FieldName.PAGES).replace("-", "--")); } } else if ("Abstract".equals(fieldName)) { h.put(FieldName.ABSTRACT, content); } else if ("Publication Type".equals(fieldName)) { if (content.contains("Book")) { h.put(BibEntry.TYPE_HEADER, "book"); } else if (content.contains("Journal")) { h.put(BibEntry.TYPE_HEADER, "article"); } else if (content.contains("Conference Paper")) { h.put(BibEntry.TYPE_HEADER, "inproceedings"); } } else if (fieldName.startsWith("Language")) { h.put(FieldName.LANGUAGE, content); } else if (fieldName.startsWith("Author Keywords")) { content = content.replace(";", ",").replace(" ", " "); h.put(FieldName.KEYWORDS, content); } else if (fieldName.startsWith("ISSN")) { h.put(FieldName.ISSN, content); } else if (fieldName.startsWith("DOI Number")) { h.put(FieldName.DOI, content); } } // Now we need to check if a book entry has given editors in the author field; // if so, rearrange: String auth = h.get(FieldName.AUTHOR); if ((auth != null) && auth.contains(" [Ed]")) { h.remove(FieldName.AUTHOR); h.put(FieldName.EDITOR, auth.replace(" [Ed]", "")); } // Rearrange names properly: auth = h.get(FieldName.AUTHOR); if (auth != null) { h.put(FieldName.AUTHOR, fixNames(auth)); } auth = h.get(FieldName.EDITOR); if (auth != null) { h.put(FieldName.EDITOR, fixNames(auth)); } // Set the entrytype properly: String entryType = h.containsKey(BibEntry.TYPE_HEADER) ? h.get(BibEntry.TYPE_HEADER) : BibEntry.DEFAULT_TYPE; h.remove(BibEntry.TYPE_HEADER); if ("book".equals(entryType) && h.containsKey("chaptertitle")) { // This means we have an "incollection" entry. entryType = "incollection"; // Move the "chaptertitle" to just "title": h.put(FieldName.TITLE, h.remove("chaptertitle")); } BibEntry b = new BibEntry(entryType); b.setField(h); bibitems.add(b); } return new ParserResult(bibitems); } /** * Convert a string of author names into a BibTeX-compatible format. * @param content The name string. * @return The formatted names. */ private static String fixNames(String content) { String names; if (content.indexOf(';') > 0) { //LN FN; [LN FN;]* names = content.replaceAll("[^\\.A-Za-z,;\\- ]", "").replace(";", " and"); } else if (content.indexOf(" ") > 0) { String[] sNames = content.split(" "); StringBuilder sb = new StringBuilder(); for (int i = 0; i < sNames.length; i++) { if (i > 0) { sb.append(" and "); } sb.append(sNames[i].replaceFirst(" ", ", ")); } names = sb.toString(); } else { names = content; } return AuthorList.fixAuthorLastNameFirst(names); } }