package net.sf.jabref.imports;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import net.sf.jabref.BibtexEntry;
import net.sf.jabref.BibtexFields;
import net.sf.jabref.Globals;
import net.sf.jabref.Util;
import net.sf.jabref.util.CaseChanger;

/**
 * Importer for the ISI Web of Science, INSPEC and Medline format.
 *
 * Documentation about the ISI WOS format:
 *
 * <ul>
 * <li>http://wos.isitrial.com/help/helpprn.html</li>
 * </ul>
 *
 * <ul>
 * <li>Check compatibility with other ISI2Bib tools like:
 * http://www-lab.imr.tohoku.ac.jp/~t-nissie/computer/software/isi/ or
 * http://www.tug.org/tex-archive/biblio/bibtex/utils/isi2bibtex/isi2bibtex or
 * http://web.mit.edu/emilio/www/utils.html</li>
 * <li>Deal with capitalization correctly</li>
 * </ul>
 *
 * @author $Author: mortenalver $
 * @version $Revision: 3047 $ ($Date: 2009-08-21 18:32:56 +0200 (Fr, 21 Aug 2009) $)
 */
public class IsiImporter extends ImportFormat {

    /**
     * Return the name of this import format.
     */
    public String getFormatName() {
        return "ISI";
    }

    /*
     * (non-Javadoc)
     *
     * @see net.sf.jabref.imports.ImportFormat#getCLIId()
     */
    public String getCLIId() {
        return "isi";
    }

    // 2006.09.05: Modified pattern to avoid false positives for other files
    // due to an extra | at the end:
    static final Pattern isiPattern = Pattern.compile("FN ISI Export Format|VR 1.|PY \\d{4}");

    /**
     * Check whether the source is in the correct format for this importer.
     */
    public boolean isRecognizedFormat(InputStream stream) throws IOException {
        BufferedReader in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));

        String str;
        int i = 0;
        while (((str = in.readLine()) != null) && (i < 50)) {

            /**
             * The following line gives false positives for RIS files, so it
             * should not be uncommented. The hyphen is a characteristic of the
             * RIS format.
             *
             * str = str.replace(" - ", "")
             */
            if (isiPattern.matcher(str).find())
                return true;

            i++;
        }

        return false;
    }

    static Pattern subsupPattern = Pattern.compile("/(sub|sup)\\s+(.*?)\\s*/");

    static public void processSubSup(HashMap<String, String> map) {

        String[] subsup = { "title", "abstract", "review", "notes" };

        for (int i = 0; i < subsup.length; i++) {
            if (map.containsKey(subsup[i])) {

                Matcher m = subsupPattern.matcher(map.get(subsup[i]));
                StringBuffer sb = new StringBuffer();

                while (m.find()) {

                    String group2 = m.group(2);
                    group2 = group2.replaceAll("\\$", "\\\\\\\\\\\\\\$"); // Escaping insanity! :-)
                    if (group2.length() > 1) {
                        group2 = "{" + group2 + "}";
                    }
                    if (m.group(1).equals("sub")) {
                        m.appendReplacement(sb, "\\$_" + group2 + "\\$");
                    } else {
                        m.appendReplacement(sb, "\\$^" + group2 + "\\$");
                    }
                }

                m.appendTail(sb);
                map.put(subsup[i], sb.toString());
            }
        }
    }

    static public void processCapitalization(HashMap<String, String> map) {

        String[] subsup = { "title", "journal", "publisher" };

        for (int i = 0; i < subsup.length; i++) {

            if (map.containsKey(subsup[i])) {

                String s = map.get(subsup[i]);
                if (s.toUpperCase().equals(s)) {
                    s = CaseChanger.changeCase(s, CaseChanger.UPPER_EACH_FIRST, true);
                    map.put(subsup[i], s);
                }
            }
        }
    }
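    /*
     * A minimal sketch of what the two helpers above do to a parsed record.
     * The field value below is invented for illustration and is not taken
     * from a real ISI export:
     *
     *   HashMap<String, String> map = new HashMap<String, String>();
     *   map.put("title", "ADSORPTION OF H/sub 2/O ON GRAPHITE");
     *   processSubSup(map);         // "/sub 2/" becomes the LaTeX subscript "$_2$"
     *   processCapitalization(map); // an all-uppercase title gets each word capitalized
     */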
    /**
     * Parse the entries in the source, and return a List of BibtexEntry
     * objects.
     */
    public List<BibtexEntry> importEntries(InputStream stream) throws IOException {
        if (stream == null) {
            throw new IOException("No stream given.");
        }

        ArrayList<BibtexEntry> bibitems = new ArrayList<BibtexEntry>();
        StringBuffer sb = new StringBuffer();

        BufferedReader in = new BufferedReader(ImportFormatReader.getReaderDefaultEncoding(stream));

        // Pattern fieldPattern = Pattern.compile("^AU |^TI |^SO |^DT |^C1 |^AB |^ID |^BP |^PY |^SE |^PY |^VL |^IS ");
        String str;

        while ((str = in.readLine()) != null) {
            if (str.length() < 3)
                continue;

            // beginning of a new item
            if (str.substring(0, 3).equals("PT "))
                sb.append("::").append(str);
            else {
                String beg = str.substring(0, 3).trim();

                // I could have used the fieldPattern regular expression instead;
                // however, this quick and dirty approach works!
                if (beg.length() == 2) {
                    sb.append(" ## "); // mark the beginning of each field
                    sb.append(str);
                } else {
                    sb.append("EOLEOL"); // mark the end of each line
                    sb.append(str.trim()); // remove the initial spaces
                }
            }
        }
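        /*
         * At this point the input has been flattened into one string. A sketch
         * of the intermediate encoding, using invented field values:
         *
         *   ::PT J ## AU Smith, JA ## TI Some long titleEOLEOLcontinued here ## PY 2009
         *
         * "::" starts a new record, " ## " starts a new field, and "EOLEOL"
         * marks a continuation line that belonged to the previous field.
         */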
        String[] entries = sb.toString().split("::");

        HashMap<String, String> hm = new HashMap<String, String>();

        // skip the first entry as it is either empty or has document header
        for (int i = 0; i < entries.length; i++) {
            String[] fields = entries[i].split(" ## ");

            if (fields.length == 0)
                fields = entries[i].split("\n");

            String Type = "";
            String PT = "";
            String pages = "";
            hm.clear();

            nextField: for (int j = 0; j < fields.length; j++) {
                // skip empty fields
                if (fields[j].length() <= 2)
                    continue;

                String beg = fields[j].substring(0, 2);
                String value = fields[j].substring(3);
                if (value.startsWith(" - ")) {
                    value = value.substring(3);
                }
                value = value.trim();

                if (beg.equals("PT")) {
                    if (value.startsWith("J")) {
                        PT = "article";
                    } else {
                        PT = value;
                    }
                    Type = "article"; // make all of them PT?
                } else if (beg.equals("TY")) {
                    if ("JOUR".equals(value))
                        Type = "article";
                    else if ("CONF".equals(value))
                        Type = "inproceedings";
                } else if (beg.equals("JO"))
                    hm.put("booktitle", value);
                else if (beg.equals("AU")) {
                    String author = isiAuthorsConvert(value.replaceAll("EOLEOL", " and "));

                    // if there is already someone there then append with "and"
                    if (hm.get("author") != null)
                        author = hm.get("author") + " and " + author;

                    hm.put("author", author);
                } else if (beg.equals("TI"))
                    hm.put("title", value.replaceAll("EOLEOL", " "));
                else if (beg.equals("SO") || beg.equals("JA"))
                    hm.put("journal", value.replaceAll("EOLEOL", " "));
                else if (beg.equals("ID") || beg.equals("KW")) {

                    value = value.replaceAll("EOLEOL", " ");
                    String existingKeywords = hm.get("keywords");
                    if (existingKeywords != null && existingKeywords.indexOf(value) == -1) {
                        existingKeywords += ", " + value;
                    } else {
                        existingKeywords = value;
                    }
                    hm.put("keywords", existingKeywords);

                } else if (beg.equals("AB"))
                    hm.put("abstract", value.replaceAll("EOLEOL", " "));
                else if (beg.equals("BP") || beg.equals("BR") || beg.equals("SP"))
                    pages = value;
                else if (beg.equals("EP")) {
                    int detpos = value.indexOf(' ');

                    // tweak for IEEE Explore
                    if (detpos != -1 && value.substring(0, detpos).trim().length() > 0)
                        value = value.substring(0, detpos);

                    pages = pages + "--" + value;
                } else if (beg.equals("PS")) {
                    pages = parsePages(value);
                } else if (beg.equals("AR"))
                    pages = value;
                else if (beg.equals("IS"))
                    hm.put("number", value);
                else if (beg.equals("PY"))
                    hm.put("year", value);
                else if (beg.equals("VL"))
                    hm.put("volume", value);
                else if (beg.equals("PU"))
                    hm.put("publisher", value);
                else if (beg.equals("DI"))
                    hm.put("doi", value);
                else if (beg.equals("PD")) {

                    String month = parseMonth(value);
                    if (month != null) {
                        hm.put("month", month);
                        continue nextField;
                    }

                } else if (beg.equals("DT")) {
                    Type = value;
                    if (Type.equals("Review")) {
                        Type = "article"; // set "Review" in Note/Comment?
                    } else if (Type.startsWith("Article") || Type.startsWith("Journal")
                        || PT.equals("article")) {
                        Type = "article";
                        continue;
                    } else {
                        Type = "misc";
                    }
                } else if (beg.equals("CR")) {
                    hm.put("CitedReferences", value.replaceAll("EOLEOL", " ; ").trim());
                } else {
                    // Preserve all other entries except
                    if (beg.equals("ER") || beg.equals("EF") || beg.equals("VR")
                        || beg.equals("FN"))
                        continue nextField;
                    hm.put(beg, value);
                }
            }

            if (!"".equals(pages))
                hm.put("pages", pages);

            // Skip empty entries
            if (hm.size() == 0)
                continue;

            BibtexEntry b = new BibtexEntry(BibtexFields.DEFAULT_BIBTEXENTRY_ID,
                Globals.getEntryType(Type)); // id assumes an existing database so don't

            // Remove empty fields:
            ArrayList<Object> toRemove = new ArrayList<Object>();
            for (Iterator<String> it = hm.keySet().iterator(); it.hasNext();) {
                Object key = it.next();
                String content = hm.get(key);
                if ((content == null) || (content.trim().length() == 0))
                    toRemove.add(key);
            }
            for (Iterator<Object> iterator = toRemove.iterator(); iterator.hasNext();) {
                hm.remove(iterator.next());
            }

            // Polish entries
            processSubSup(hm);
            processCapitalization(hm);

            b.setField(hm);

            bibitems.add(b);
        }

        return bibitems;
    }
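    /*
     * Rough examples of what the helper methods below return; the inputs are
     * invented for illustration:
     *
     *   parsePages("345-350")                    -> "345--350"
     *   parseMonth("JUL 2009")                   -> "#jul#"
     *   parseMonth("2009 7")                     -> "#jul#"   (numeric months are mapped via Globals.MONTHS)
     *   isiAuthorsConvert("Smith, JA; Jones, P") -> "Smith, J. A. and Jones, P."
     */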
    public static String parsePages(String value) {
        int lastDash = value.lastIndexOf("-");
        return value.substring(0, lastDash) + "--" + value.substring(lastDash + 1);
    }

    public static String parseMonth(String value) {

        String[] parts = value.split("\\s|\\-");
        for (int ii = 0; ii < parts.length; ii++) {
            if (Globals.MONTH_STRINGS.containsKey(parts[ii].toLowerCase())) {
                return "#" + parts[ii].toLowerCase() + "#";
            }
        }

        // Try two digit month
        for (int ii = 0; ii < parts.length; ii++) {
            int number;
            try {
                number = Integer.parseInt(parts[ii]);
                if (number >= 1 && number <= 12) {
                    return "#" + Globals.MONTHS[number - 1] + "#";
                }
            } catch (NumberFormatException e) {
            }
        }
        return null;
    }

    /**
     * Will expand ISI first names.
     *
     * Fixed bug from:
     * http://sourceforge.net/tracker/index.php?func=detail&aid=1542552&group_id=92314&atid=600306
     */
    public static String isiAuthorConvert(String author) {

        String[] s = author.split(",");
        if (s.length != 2)
            return author;

        StringBuffer sb = new StringBuffer();

        String last = s[0].trim();
        sb.append(last).append(", ");

        String first = s[1].trim();

        String[] firstParts = first.split("\\s+");

        for (int i = 0; i < firstParts.length; i++) {

            first = firstParts[i];

            // Do we have only uppercase chars?
            if (first.toUpperCase().equals(first)) {
                first = first.replaceAll("\\.", "");
                for (int j = 0; j < first.length(); j++) {
                    sb.append(first.charAt(j)).append(".");

                    if (j < first.length() - 1)
                        sb.append(" ");
                }
            } else {
                sb.append(first);
            }
            if (i < firstParts.length - 1) {
                sb.append(" ");
            }
        }
        return sb.toString();
    }

    public static String[] isiAuthorsConvert(String[] authors) {

        String[] result = new String[authors.length];
        for (int i = 0; i < result.length; i++) {
            result[i] = isiAuthorConvert(authors[i]);
        }
        return result;
    }

    public static String isiAuthorsConvert(String authors) {
        String[] s = isiAuthorsConvert(authors.split(" and |;"));
        return Util.join(s, " and ", 0, s.length);
    }
}
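// A minimal usage sketch (illustrative only; the file name is invented, and in
// JabRef these methods are normally driven by the import framework rather than
// called directly):
//
//   IsiImporter importer = new IsiImporter();
//   if (importer.isRecognizedFormat(new FileInputStream("records.isi"))) {
//       // the recognition check consumes its stream, so open a fresh one for the import
//       List<BibtexEntry> entries = importer.importEntries(new FileInputStream("records.isi"));
//   }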