IsiImporter.java example

Explorer
jabref-master
- src
package org.jabref.logic.importer.fileformat;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jabref.logic.formatter.casechanger.TitleCaseFormatter;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.util.FileExtensions;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.Month;

/**
 * Importer for the ISI Web of Science, INSPEC and Medline format.
 * <p>
 * Documentation about ISI WOS format:
 * <p>
 * <ul>
 * <li>http://wos.isitrial.com/help/helpprn.html</li>
 * </ul>
 * <p>
 * Open points:
 * <ul>
 * <li>Check compatibility with other ISI2Bib tools like:
 * http://www-lab.imr.tohoku.ac.jp/~t-nissie/computer/software/isi/ or
 * http://www.tug.org/tex-archive/biblio/bibtex/utils/isi2bibtex/isi2bibtex or
 * http://web.mit.edu/emilio/www/utils.html</li>
 * <li>Deal with capitalization correctly</li>
 * </ul>
 */
public class IsiImporter extends Importer {

    /** Matches inline markup of the form {@code /sub text/} or {@code /sup text/}. */
    private static final Pattern SUB_SUP_PATTERN = Pattern.compile("/(sub|sup)\\s+(.*?)\\s*/");

    // 2006.09.05: Modified pattern to avoid false positives for other files due to an
    // extra | at the end:
    // NOTE(review): the '.' in "VR 1." is an unescaped regex wildcard, so "VR 1" followed by
    // any character matches. Left unchanged to keep format detection exactly as it was.
    private static final Pattern ISI_PATTERN = Pattern.compile("FN ISI Export Format|VR 1.|PY \\d{4}");

    /** Number of leading lines inspected by {@link #isRecognizedFormat(BufferedReader)}. */
    private static final int MAX_LINES_TO_SNIFF = 50;

    /** Internal marker placed in front of every record (a line starting with {@code "PT "}). */
    private static final String ENTRY_SEPARATOR = "::";

    /** Internal marker placed in front of every line that starts a new tagged field. */
    private static final String FIELD_SEPARATOR = " ## ";

    /** Internal marker that stands for a line break inside a multi-line field value. */
    private static final String LINE_BREAK_MARKER = "EOLEOL";

    @Override
    public String getName() {
        return "ISI";
    }

    @Override
    public FileExtensions getExtensions() {
        return FileExtensions.ISI;
    }

    @Override
    public String getId() {
        return "isi";
    }

    @Override
    public String getDescription() {
        return "Importer for the ISI Web of Science, INSPEC and Medline format.";
    }

    /**
     * Checks whether one of the first {@value #MAX_LINES_TO_SNIFF} lines contains an ISI marker
     * (see {@code ISI_PATTERN}).
     *
     * @param reader source to inspect; lines are consumed from it
     * @return {@code true} as soon as a line matches, {@code false} if none of the inspected
     *         lines does or the input ends first
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
        for (int linesRead = 0; linesRead < MAX_LINES_TO_SNIFF; linesRead++) {
            String line = reader.readLine();
            if (line == null) {
                return false;
            }

            // Do NOT strip " - " from the line before matching
            // (i.e. line = line.replace(" - ", "")): that gives false positives for RIS
            // files, because the hyphen is a characteristic of the RIS format.
            if (IsiImporter.ISI_PATTERN.matcher(line).find()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Rewrites {@code /sub x/} and {@code /sup x/} markup in the title, abstract, review and
     * "notes" fields of {@code map} to {@code $_x$} and {@code $^x$}. Texts longer than one
     * character are wrapped in braces, and a {@code $} inside the text is written as {@code \$}.
     * <p>
     * The replacement is passed through {@link Matcher#quoteReplacement(String)}, so backslashes
     * in the text are kept verbatim instead of being interpreted as replacement escapes.
     *
     * @param map field name to field value; modified in place. Fields that are absent or
     *            mapped to {@code null} are skipped.
     */
    public static void processSubSup(Map<String, String> map) {

        String[] candidateFields = {FieldName.TITLE, FieldName.ABSTRACT, FieldName.REVIEW, "notes"};

        for (String fieldName : candidateFields) {
            String text = map.get(fieldName);
            if (text == null) {
                continue;
            }

            Matcher m = IsiImporter.SUB_SUP_PATTERN.matcher(text);
            // StringBuffer (not StringBuilder): the StringBuilder overload of
            // appendReplacement only exists since Java 9.
            StringBuffer sb = new StringBuffer();

            while (m.find()) {
                String script = m.group(2).replace("$", "\\$");
                if (script.length() > 1) {
                    script = "{" + script + "}";
                }
                String operator = "sub".equals(m.group(1)) ? "_" : "^";
                m.appendReplacement(sb, Matcher.quoteReplacement("$" + operator + script + "$"));
            }
            m.appendTail(sb);
            map.put(fieldName, sb.toString());
        }
    }

    /**
     * Runs {@link TitleCaseFormatter} over the title, journal and publisher fields whose value
     * is unchanged by upper-casing (i.e. contains no lower-case letters).
     *
     * @param map field name to field value; modified in place
     */
    private static void processCapitalization(Map<String, String> map) {

        String[] candidateFields = {FieldName.TITLE, FieldName.JOURNAL, FieldName.PUBLISHER};

        for (String fieldName : candidateFields) {
            if (!map.containsKey(fieldName)) {
                continue;
            }

            String original = map.get(fieldName);
            if (original.toUpperCase(Locale.ROOT).equals(original)) {
                map.put(fieldName, new TitleCaseFormatter().format(original));
            }
        }
    }

    /**
     * Parses all records from {@code reader} into {@link BibEntry} objects.
     * <p>
     * The input is first flattened into one string (see
     * {@link #flattenRecords(BufferedReader)}), then split into records and tagged fields.
     * Each known two-letter tag is mapped to a field; unknown tags are stored under their
     * lower-cased tag name, except ER, EF, VR and FN, which are dropped.
     *
     * @param reader source to read; must not be {@code null}
     * @return a result wrapping the parsed entries, in input order
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public ParserResult importDatabase(BufferedReader reader) throws IOException {
        Objects.requireNonNull(reader);

        String[] entries = flattenRecords(reader).split(ENTRY_SEPARATOR);

        List<BibEntry> bibitems = new ArrayList<>();
        Map<String, String> hm = new HashMap<>();

        // The first chunk is either empty or holds the document header (FN/VR lines). It is
        // not skipped explicitly; it normally yields no fields and is dropped by the
        // isEmpty() check below.
        for (String entry : entries) {
            String[] fields = entry.split(FIELD_SEPARATOR);

            if (fields.length == 0) {
                fields = entry.split("\n");
            }

            String type = "";
            String pt = "";
            String pages = "";
            hm.clear();

            for (String field : fields) {
                // empty field: nothing to do
                if (field.length() <= 2) {
                    continue;
                }

                String tag = field.substring(0, 2);
                String value = field.substring(3);
                if (value.startsWith(" - ")) {
                    value = value.substring(3);
                }
                value = value.trim();

                switch (tag) {
                    case "PT":
                        pt = value.startsWith("J") ? "article" : value;
                        type = "article"; // make all of them PT?
                        break;
                    case "TY":
                        if ("JOUR".equals(value)) {
                            type = "article";
                        } else if ("CONF".equals(value)) {
                            type = "inproceedings";
                        }
                        break;
                    case "JO":
                        hm.put(FieldName.BOOKTITLE, value.replace(LINE_BREAK_MARKER, " "));
                        break;
                    case "AU": {
                        String author = IsiImporter.isiAuthorsConvert(value.replace(LINE_BREAK_MARKER, " and "));

                        // if there is already someone there then append with "and"
                        String previousAuthors = hm.get(FieldName.AUTHOR);
                        if (previousAuthors != null) {
                            author = previousAuthors + " and " + author;
                        }
                        hm.put(FieldName.AUTHOR, author);
                        break;
                    }
                    case "TI":
                        hm.put(FieldName.TITLE, value.replace(LINE_BREAK_MARKER, " "));
                        break;
                    case "SO":
                    case "JA":
                        hm.put(FieldName.JOURNAL, value.replace(LINE_BREAK_MARKER, " "));
                        break;
                    case "ID":
                    case "KW": {
                        String newKeywords = value.replace(LINE_BREAK_MARKER, " ");
                        String existingKeywords = hm.get(FieldName.KEYWORDS);
                        String merged;
                        if (existingKeywords == null) {
                            merged = newKeywords;
                        } else if (existingKeywords.contains(newKeywords)) {
                            // Already present: keep what was collected so far instead of
                            // overwriting it with the (shorter) new value.
                            merged = existingKeywords;
                        } else {
                            merged = existingKeywords + ", " + newKeywords;
                        }
                        hm.put(FieldName.KEYWORDS, merged);
                        break;
                    }
                    case "AB":
                        hm.put(FieldName.ABSTRACT, value.replace(LINE_BREAK_MARKER, " "));
                        break;
                    case "BP":
                    case "BR":
                    case "SP":
                    case "AR":
                        pages = value;
                        break;
                    case "EP": {
                        int firstSpace = value.indexOf(' ');

                        // tweak for IEEE Explore: keep only the part before the first space
                        if ((firstSpace != -1) && !value.substring(0, firstSpace).trim().isEmpty()) {
                            value = value.substring(0, firstSpace);
                        }

                        pages = pages + "--" + value;
                        break;
                    }
                    case "PS":
                        pages = IsiImporter.parsePages(value);
                        break;
                    case "IS":
                        hm.put(FieldName.NUMBER, value);
                        break;
                    case "PY":
                        hm.put(FieldName.YEAR, value);
                        break;
                    case "VL":
                        hm.put(FieldName.VOLUME, value);
                        break;
                    case "PU":
                        hm.put(FieldName.PUBLISHER, value);
                        break;
                    case "DI":
                        hm.put(FieldName.DOI, value);
                        break;
                    case "PD": {
                        String month = IsiImporter.parseMonth(value);
                        if (month != null) {
                            hm.put(FieldName.MONTH, month);
                        }
                        break;
                    }
                    case "DT":
                        // "Review" is mapped to article as well; set "Review" in Note/Comment?
                        if ("Review".equals(value) || value.startsWith("Article") || value.startsWith("Journal")
                                || "article".equals(pt)) {
                            type = "article";
                        } else {
                            type = BibEntry.DEFAULT_TYPE;
                        }
                        break;
                    case "CR":
                        hm.put("CitedReferences", value.replace(LINE_BREAK_MARKER, " ; ").trim());
                        break;
                    case "ER":
                    case "EF":
                    case "VR":
                    case "FN":
                        // record/file delimiters and header lines carry no entry data
                        break;
                    default:
                        // Preserve all other fields under their lower-cased tag; the internal
                        // line marker must not leak into the stored value.
                        hm.put(tag.toLowerCase(Locale.ROOT), value.replace(LINE_BREAK_MARKER, " "));
                        break;
                }
            }

            if (!"".equals(pages)) {
                hm.put(FieldName.PAGES, pages);
            }

            // Skip empty entries
            if (hm.isEmpty()) {
                continue;
            }

            BibEntry b = new BibEntry(type);
            // id assumes an existing database so don't

            // Remove empty fields:
            hm.values().removeIf(content -> (content == null) || content.trim().isEmpty());

            // Polish entries
            IsiImporter.processSubSup(hm);
            IsiImporter.processCapitalization(hm);

            b.setField(hm);

            bibitems.add(b);
        }
        return new ParserResult(bibitems);
    }

    /**
     * Reads all lines and concatenates them into a single string in which records, fields and
     * continuation lines are introduced by internal markers. Lines shorter than three
     * characters are ignored.
     *
     * @param reader source to read until exhausted
     * @return the flattened input
     * @throws IOException if reading from {@code reader} fails
     */
    private static String flattenRecords(BufferedReader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        String line;

        while ((line = reader.readLine()) != null) {
            if (line.length() < 3) {
                continue;
            }

            if (line.startsWith("PT ")) {
                // beginning of a new item
                sb.append(ENTRY_SEPARATOR).append(line);
            } else if (line.substring(0, 3).trim().length() == 2) {
                // the first three characters hold a two-character tag: start of a field
                sb.append(FIELD_SEPARATOR).append(line);
            } else {
                // continuation line: mark the line break and drop the indentation
                sb.append(LINE_BREAK_MARKER).append(line.trim());
            }
        }
        return sb.toString();
    }

    /**
     * Replaces the last {@code '-'} in {@code value} with {@code "--"}.
     *
     * @param value page specification, e.g. {@code "12-34"}
     * @return {@code value} with its last dash doubled, or {@code value} unchanged when it
     *         contains no dash
     */
    private static String parsePages(String value) {
        int lastDash = value.lastIndexOf('-');
        if (lastDash < 0) {
            return value;
        }
        return value.substring(0, lastDash) + "--" + value.substring(lastDash + 1);
    }

    /**
     * Extracts a month from a date value. The value is split at whitespace and dashes; the
     * first part recognised by {@link Month#getMonthByShortName(String)} (after lower-casing)
     * wins. If there is none, the first part that parses as an integer accepted by
     * {@link Month#getMonthByNumber(int)} is used.
     *
     * @param value raw date text
     * @return the month in the form produced by {@code Month.getJabRefFormat()}, or
     *         {@code null} if no part could be resolved
     */
    public static String parseMonth(String value) {

        String[] parts = value.split("\\s|\\-");

        // First pass: month names
        for (String part : parts) {
            Optional<Month> byName = Month.getMonthByShortName(part.toLowerCase(Locale.ROOT));
            if (byName.isPresent()) {
                return byName.get().getJabRefFormat();
            }
        }

        // Second pass: numeric month
        for (String part : parts) {
            try {
                Optional<Month> byNumber = Month.getMonthByNumber(Integer.parseInt(part));
                if (byNumber.isPresent()) {
                    return byNumber.get().getJabRefFormat();
                }
            } catch (NumberFormatException ignored) {
                // not a number: try the next part
            }
        }
        return null;
    }

    /**
     * Will expand ISI first names.
     * <p>
     * Expects {@code "Last, First"} with exactly one comma; any other input is returned
     * unchanged. Each whitespace-separated given-name part that contains no lower-case
     * letters has its dots removed and every remaining character written as a separate
     * initial ({@code "JM"} becomes {@code "J. M."}); other parts are kept as they are.
     * <p>
     * Fixed bug from:
     * http://sourceforge.net/tracker/index.php?func=detail&aid=1542552&group_id=92314&atid=600306
     *
     * @param author a single author name
     * @return the converted name
     */
    public static String isiAuthorConvert(String author) {

        String[] nameParts = author.split(",");
        if (nameParts.length != 2) {
            return author;
        }

        StringBuilder sb = new StringBuilder(nameParts[0].trim()).append(", ");

        String[] givenNames = nameParts[1].trim().split("\\s+");

        for (int i = 0; i < givenNames.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }

            String given = givenNames[i];

            // Anything with a lower-case character is a spelled-out name: keep it.
            if (!given.toUpperCase(Locale.ROOT).equals(given)) {
                sb.append(given);
                continue;
            }

            String initials = given.replace(".", "");
            for (int j = 0; j < initials.length(); j++) {
                if (j > 0) {
                    sb.append(' ');
                }
                sb.append(initials.charAt(j)).append('.');
            }
        }
        return sb.toString();

    }

    /**
     * Applies {@link #isiAuthorConvert(String)} to every element.
     *
     * @param authors author names, one per element
     * @return a new array of the same length with the converted names
     */
    private static String[] isiAuthorsConvert(String[] authors) {

        String[] converted = new String[authors.length];
        for (int i = 0; i < authors.length; i++) {
            converted[i] = IsiImporter.isiAuthorConvert(authors[i]);
        }
        return converted;
    }

    /**
     * Splits {@code authors} at {@code " and "} or {@code ";"}, converts each name with
     * {@link #isiAuthorConvert(String)} and joins the results with {@code " and "}.
     *
     * @param authors author list in a single string
     * @return the converted author list
     */
    public static String isiAuthorsConvert(String authors) {
        String[] converted = IsiImporter.isiAuthorsConvert(authors.split(" and |;"));
        return String.join(" and ", converted);
    }

}