// OvidImporter.java example
//
// Explorer
// jabref-2.9.2-master
// - src
/*  Copyright (C) 2003-2011 JabRef contributors.
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package net.sf.jabref.imports;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import net.sf.jabref.BibtexEntry;
import net.sf.jabref.Globals;
import net.sf.jabref.OutputPrinter;
import net.sf.jabref.Util;
import net.sf.jabref.AuthorList;

/**
 * Imports an Ovid file.
 *
 * <p>The input is treated as a sequence of records, each introduced by a
 * marker of the form {@code <n>} (digits in angle brackets). Inside a record,
 * every non-empty line that does not start with a space begins a new field:
 * that line holds the field name and the following lines, up to the next such
 * line, hold the field content.
 */
public class OvidImporter extends ImportFormat {

    /**
     * "Source" layout with an issue number.
     * Groups: 1 journal, 2 volume, 3 issue, 4 pages, 5 year.
     */
    public static final Pattern ovid_src_pat = Pattern
            .compile("Source ([ \\w&\\-,:]+)\\.[ ]+([0-9]+)\\(([\\w\\-]+)\\):([0-9]+\\-?[0-9]+?)\\,.*([0-9][0-9][0-9][0-9])");

    /**
     * "Source" layout without an issue number.
     * Groups: 1 journal, 2 volume, 3 pages, 4 year.
     */
    public static final Pattern ovid_src_pat_no_issue = Pattern
            .compile("Source ([ \\w&\\-,:]+)\\.[ ]+([0-9]+):([0-9]+\\-?[0-9]+?)\\,.*([0-9][0-9][0-9][0-9])");

    /**
     * Alternative "Vol" layout.
     * Groups: 1 journal, 2 volume, 3 issue, 4 month, 5 year, 6 pages.
     */
    public static final Pattern ovid_src_pat_2 = Pattern.compile(
            "([ \\w&\\-,]+)\\. Vol ([0-9]+)\\(([\\w\\-]+)\\) ([A-Za-z]+) ([0-9][0-9][0-9][0-9]), ([0-9]+\\-?[0-9]+)");

    /**
     * Layout used for a chapter in a collection.
     * Groups: 1 editor text, 2 year, 3 book title, 4 pages, 5 address, 6 publisher.
     */
    public static final Pattern incollection_pat = Pattern.compile(
            "(.+)\\(([0-9][0-9][0-9][0-9])\\)\\. ([ \\w&\\-,:]+)\\.[ ]+\\(pp. ([0-9]+\\-?[0-9]+?)\\).[A-Za-z0-9, ]+pp\\. "
            + "([\\w, ]+): ([\\w, ]+)");

    /**
     * Layout used for a whole book.
     * Groups: 1 year, 2 page count, 3 address, 4 publisher.
     */
    public static final Pattern book_pat = Pattern.compile(
            "\\(([0-9][0-9][0-9][0-9])\\)\\. [A-Za-z, ]+([0-9]+) pp\\. ([\\w, ]+): ([\\w, ]+)");

    /** Marker that starts each record: a number in angle brackets. */
    static final Pattern ovidPattern = Pattern.compile("<[0-9]+>");

    /** Bracketed text that is stripped from titles. */
    private static final Pattern BRACKETED_TEXT = Pattern.compile("\\[.+\\]");

    /** Internal marker inserted in front of every line that starts a new field. */
    private static final String FIELD_MARKER = "__NEWFIELD__";

    /** Number of leading lines inspected by {@link #isRecognizedFormat(InputStream)}. */
    private static final int MAX_LINES_TO_CHECK = 50;

    /**
     * Return the name of this import format.
     */
    public String getFormatName() {
        return "Ovid";
    }

    /*
     *  (non-Javadoc)
     * @see net.sf.jabref.imports.ImportFormat#getCLIId()
     */
    public String getCLIId() {
        return "ovid";
    }

    /**
     * Check whether the source is in the correct format for this importer.
     *
     * <p>The input is accepted as soon as one of its first 50 lines contains
     * a record marker ({@code <n>}).
     *
     * @param stream the input to inspect
     * @return {@code true} if a record marker was found, {@code false} otherwise
     * @throws IOException if reading from the stream fails
     */
    public boolean isRecognizedFormat(InputStream stream) throws IOException {
        // The reader is not closed here because the stream was passed in by the caller.
        BufferedReader in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
        for (int i = 0; i < MAX_LINES_TO_CHECK; i++) {
            String str = in.readLine();
            if (str == null) {
                break;
            }
            if (ovidPattern.matcher(str).find()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Parse the entries in the source, and return a List of BibtexEntry
     * objects.
     *
     * @param stream the input to parse
     * @param status not used by this importer
     * @return one entry per record found in the input; empty if there is none
     * @throws IOException if reading from the stream fails
     */
    public List<BibtexEntry> importEntries(InputStream stream, OutputPrinter status) throws IOException {
        ArrayList<BibtexEntry> bibitems = new ArrayList<BibtexEntry>();
        String[] items = ovidPattern.split(readWithFieldMarkers(stream));

        // items[0] is whatever precedes the first record marker, so start at 1.
        for (int i = 1; i < items.length; i++) {
            bibitems.add(createEntry(parseRecord(items[i])));
        }
        return bibitems;
    }

    /**
     * Reads the whole input into one string, terminating every line with
     * {@code '\n'} and prefixing each non-empty line that does not start with
     * a space with {@link #FIELD_MARKER}.
     */
    private static String readWithFieldMarkers(InputStream stream) throws IOException {
        // The reader is not closed here because the stream was passed in by the caller.
        BufferedReader in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = in.readLine()) != null) {
            if (line.length() > 0 && line.charAt(0) != ' ') {
                sb.append(FIELD_MARKER);
            }
            sb.append(line).append('\n');
        }
        return sb.toString();
    }

    /**
     * Splits one record into its fields and collects the recognised ones
     * under BibTeX-style keys.
     */
    private static HashMap<String, String> parseRecord(String item) {
        HashMap<String, String> h = new HashMap<String, String>();
        String[] fields = item.split(FIELD_MARKER);
        for (int j = 0; j < fields.length; j++) {
            int linebreak = fields[j].indexOf('\n');
            if (linebreak < 0) {
                // A chunk without a line break has a name but no content (this
                // happens when a record marker sits on a field-name line).
                // There is nothing to import, and substring(0, -1) would throw.
                continue;
            }
            String fieldName = fields[j].substring(0, linebreak).trim();
            String content = fields[j].substring(linebreak).trim();
            storeField(fieldName, content, h);
        }
        return h;
    }

    /**
     * Stores a single field in {@code h} if its name is one this importer
     * understands; other fields are ignored.
     */
    private static void storeField(String fieldName, String content, HashMap<String, String> h) {
        // Check if this is the author field (due to a minor special treatment for this field):
        boolean isAuthor = fieldName.startsWith("Author")
                && !fieldName.contains("Author Keywords")
                && !fieldName.contains("Author e-mail");

        // Remove unnecessary dots at the end of lines, unless this is the author field,
        // in which case a dot at the end could be significant:
        if (!isAuthor && content.endsWith(".")) {
            content = content.substring(0, content.length() - 1);
        }

        if (isAuthor) {
            h.put("author", content);
        } else if (fieldName.startsWith("Title")) {
            content = BRACKETED_TEXT.matcher(content).replaceAll("").trim();
            // Removing the bracketed part may expose another trailing dot.
            if (content.endsWith(".")) {
                content = content.substring(0, content.length() - 1);
            }
            h.put("title", content);
        } else if (fieldName.startsWith("Chapter Title")) {
            h.put("chaptertitle", content);
        } else if (fieldName.startsWith("Source")) {
            parseSource(content, h);
        } else if (fieldName.equals("Abstract")) {
            h.put("abstract", content);
        } else if (fieldName.equals("Publication Type")) {
            if (content.indexOf("Book") >= 0) {
                h.put("entrytype", "book");
            } else if (content.indexOf("Journal") >= 0) {
                h.put("entrytype", "article");
            } else if (content.indexOf("Conference Paper") >= 0) {
                h.put("entrytype", "inproceedings");
            }
        }
    }

    /**
     * Extracts bibliographic data from the content of a "Source" field.
     *
     * <p>The "Source" field is a complete mess - it can have several different
     * formats, but since it can contain journal name, book title, year, month,
     * volume etc. we must try to parse it. The known layouts are tried in a
     * fixed order and the first one that matches wins; if none matches,
     * {@code h} is left untouched.
     */
    private static void parseSource(String content, HashMap<String, String> h) {
        Matcher matcher;
        String pages = null;
        if ((matcher = ovid_src_pat.matcher(content)).find()) {
            h.put("journal", matcher.group(1));
            h.put("volume", matcher.group(2));
            h.put("issue", matcher.group(3));
            pages = matcher.group(4);
            h.put("year", matcher.group(5));
        } else if ((matcher = ovid_src_pat_no_issue.matcher(content)).find()) { // may be missing the issue
            h.put("journal", matcher.group(1));
            h.put("volume", matcher.group(2));
            pages = matcher.group(3);
            h.put("year", matcher.group(4));
        } else if ((matcher = ovid_src_pat_2.matcher(content)).find()) {
            h.put("journal", matcher.group(1));
            h.put("volume", matcher.group(2));
            h.put("issue", matcher.group(3));
            h.put("month", matcher.group(4));
            h.put("year", matcher.group(5));
            pages = matcher.group(6);
        } else if ((matcher = incollection_pat.matcher(content)).find()) {
            h.put("editor", matcher.group(1).replace(" (Ed)", ""));
            h.put("year", matcher.group(2));
            h.put("booktitle", matcher.group(3));
            pages = matcher.group(4);
            h.put("address", matcher.group(5));
            h.put("publisher", matcher.group(6));
        } else if ((matcher = book_pat.matcher(content)).find()) {
            h.put("year", matcher.group(1));
            pages = matcher.group(2);
            h.put("address", matcher.group(3));
            h.put("publisher", matcher.group(4));
        }

        // Add double hyphens to page ranges. Only the value extracted by this
        // call is converted, so a "pages" value stored earlier is never
        // converted a second time.
        if (pages != null) {
            h.put("pages", pages.replace("-", "--"));
        }
    }

    /**
     * Post-processes the collected fields (editor/author rearrangement, name
     * formatting, entry type selection) and builds the entry.
     */
    private BibtexEntry createEntry(HashMap<String, String> h) {
        // Now we need to check if a book entry has given editors in the author field;
        // if so, rearrange:
        String auth = h.get("author");
        if ((auth != null) && (auth.indexOf(" [Ed]") >= 0)) {
            h.remove("author");
            h.put("editor", auth.replace(" [Ed]", ""));
        }

        // Rearrange names properly:
        auth = h.get("author");
        if (auth != null) {
            h.put("author", fixNames(auth));
        }
        auth = h.get("editor");
        if (auth != null) {
            h.put("editor", fixNames(auth));
        }

        // Set the entrytype properly; "entrytype" is only a helper key and must
        // not end up as a field of the entry.
        String entryType = h.remove("entrytype");
        if (entryType == null) {
            entryType = "other";
        }
        if (entryType.equals("book") && h.containsKey("chaptertitle")) {
            // This means we have an "incollection" entry.
            entryType = "incollection";
            // Move the "chaptertitle" to just "title":
            h.put("title", h.remove("chaptertitle"));
        }

        BibtexEntry b = new BibtexEntry(Util.createNeutralId(), Globals.getEntryType(entryType));
        b.setField(h);
        return b;
    }

    /**
     * Convert a string of author names into a BibTeX-compatible format.
     *
     * <p>Two input layouts are handled before the result is handed to
     * {@code AuthorList.fixAuthor_lastNameFirst}: names separated by
     * semicolons, and names separated by two spaces. Any other input is
     * passed on unchanged.
     *
     * @param content The name string.
     * @return The formatted names.
     */
    private String fixNames(String content) {
        String names;
        if (content.indexOf(";") > 0) { //LN FN; [LN FN;]*
            // Keep only letters, dots, commas, semicolons, hyphens and spaces,
            // then turn each semicolon into " and".
            names = content.replaceAll("[^\\.A-Za-z,;\\- ]", "").replaceAll(";", " and");
        } else if (content.indexOf("  ") > 0) {
            // Names are separated by two spaces; within each name a comma is
            // inserted after the first space-delimited part.
            String[] sNames = content.split("  ");
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < sNames.length; i++) {
                if (i > 0) {
                    sb.append(" and ");
                }
                sb.append(sNames[i].replaceFirst(" ", ", "));
            }
            names = sb.toString();
        } else {
            names = content;
        }
        return AuthorList.fixAuthor_lastNameFirst(names);
    }

}