package org.jabref.logic.importer.fileformat;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jabref.logic.formatter.casechanger.TitleCaseFormatter;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.util.FileExtensions;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.FieldName;
import org.jabref.model.entry.Month;
/**
* Importer for the ISI Web of Science, INSPEC and Medline format.
* <p>
* Documentation about ISI WOS format:
* <p>
* <ul>
* <li>http://wos.isitrial.com/help/helpprn.html</li>
* </ul>
* <p>
* Open points (TODO):
* <ul>
* <li>Check compatibility with other ISI2Bib tools like:
* http://www-lab.imr.tohoku.ac.jp/~t-nissie/computer/software/isi/ or
* http://www.tug.org/tex-archive/biblio/bibtex/utils/isi2bibtex/isi2bibtex or
* http://web.mit.edu/emilio/www/utils.html</li>
* <li>Deal with capitalization correctly</li>
* </ul>
*/
public class IsiImporter extends Importer {

    /** Matches ISI sub-/superscript markup such as {@code /sub 2/} or {@code /sup 31/}. */
    private static final Pattern SUB_SUP_PATTERN = Pattern.compile("/(sub|sup)\\s+(.*?)\\s*/");

    // 2006.09.05: Modified pattern to avoid false positives for other files due to an
    // extra | at the end.
    // The dot after "VR 1" is escaped so that it matches a literal dot only.
    private static final Pattern ISI_PATTERN = Pattern.compile("FN ISI Export Format|VR 1\\.|PY \\d{4}");

    /** Number of leading lines inspected by {@link #isRecognizedFormat(BufferedReader)}. */
    private static final int RECOGNITION_LINE_LIMIT = 50;

    /** Internal marker placed in front of every line that starts a new entry ("PT " line). */
    private static final String ENTRY_MARKER = "::";

    /** Internal marker placed in front of every line that starts a new two-letter field. */
    private static final String FIELD_MARKER = " ## ";

    /** Internal marker placed in front of every continuation line of a field. */
    private static final String EOL_MARKER = "EOLEOL";

    @Override
    public String getName() {
        return "ISI";
    }

    @Override
    public FileExtensions getExtensions() {
        return FileExtensions.ISI;
    }

    @Override
    public String getId() {
        return "isi";
    }

    @Override
    public String getDescription() {
        return "Importer for the ISI Web of Science, INSPEC and Medline format.";
    }

    /**
     * Checks whether one of the first {@value #RECOGNITION_LINE_LIMIT} lines contains an ISI marker
     * (see {@link #ISI_PATTERN}).
     *
     * @param reader source of the lines to inspect
     * @return {@code true} as soon as a line matches, {@code false} if none of the inspected lines does
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader reader) throws IOException {
        // Do NOT strip " - " from the lines before matching (str.replace(" - ", "")):
        // that gives false positives for RIS files, because the hyphen is a
        // characteristic of the RIS format.
        int linesChecked = 0;
        String line;
        // The limit is tested first so that no line is consumed without being inspected.
        while ((linesChecked < RECOGNITION_LINE_LIMIT) && ((line = reader.readLine()) != null)) {
            if (ISI_PATTERN.matcher(line).find()) {
                return true;
            }
            linesChecked++;
        }
        return false;
    }

    /**
     * Rewrites ISI sub-/superscript markup ({@code /sub x/}, {@code /sup x/}) in the title, abstract,
     * review and notes fields of {@code map} into {@code $_x$} / {@code $^x$}. Content longer than one
     * character is wrapped in braces; a {@code $} inside the content is written as {@code \$}.
     * Fields that are missing or mapped to {@code null} are left untouched.
     *
     * @param map field name to field content; modified in place
     */
    public static void processSubSup(Map<String, String> map) {
        String[] fieldsWithMarkup = {FieldName.TITLE, FieldName.ABSTRACT, FieldName.REVIEW, "notes"};
        for (String fieldName : fieldsWithMarkup) {
            String text = map.get(fieldName);
            if (text == null) {
                continue;
            }
            Matcher m = SUB_SUP_PATTERN.matcher(text);
            // Matcher.appendReplacement(StringBuffer, ...) is the overload the original code relied on.
            StringBuffer sb = new StringBuffer();
            while (m.find()) {
                // Build the literal output first ...
                String content = m.group(2).replace("$", "\\$");
                if (content.length() > 1) {
                    content = "{" + content + "}";
                }
                String operator = "sub".equals(m.group(1)) ? "_" : "^";
                // ... and quote it as a whole, so that neither '$' nor '\' coming from the
                // matched text is interpreted as replacement syntax.
                m.appendReplacement(sb, Matcher.quoteReplacement("$" + operator + content + "$"));
            }
            m.appendTail(sb);
            map.put(fieldName, sb.toString());
        }
    }

    /**
     * Applies {@link TitleCaseFormatter} to the title, journal and publisher fields of {@code map}
     * when their content is unchanged by upper-casing (i.e. contains no lower-case letters).
     *
     * @param map field name to field content; modified in place
     */
    private static void processCapitalization(Map<String, String> map) {
        String[] fieldsToNormalize = {FieldName.TITLE, FieldName.JOURNAL, FieldName.PUBLISHER};
        for (String fieldName : fieldsToNormalize) {
            String text = map.get(fieldName);
            if ((text != null) && text.toUpperCase(Locale.ROOT).equals(text)) {
                map.put(fieldName, new TitleCaseFormatter().format(text));
            }
        }
    }

    /**
     * Reads all lines from {@code reader}, splits them into entries at lines starting with
     * {@code "PT "} and converts every entry that yields at least one non-blank field.
     *
     * @param reader source of the ISI data; must not be {@code null}
     * @return parser result holding the converted entries in input order
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public ParserResult importDatabase(BufferedReader reader) throws IOException {
        Objects.requireNonNull(reader);
        List<BibEntry> bibitems = new ArrayList<>();
        // The chunk in front of the first "PT " line is either empty or holds the document
        // header (FN/VR); it produces no fields and is therefore dropped by parseEntry.
        for (String entry : readMarkedUpInput(reader).split(ENTRY_MARKER)) {
            parseEntry(entry).ifPresent(bibitems::add);
        }
        return new ParserResult(bibitems);
    }

    /**
     * Concatenates the input into one string in which entry starts, field starts and
     * continuation lines are prefixed with {@link #ENTRY_MARKER}, {@link #FIELD_MARKER} and
     * {@link #EOL_MARKER} respectively. Lines shorter than three characters are dropped.
     */
    private static String readMarkedUpInput(BufferedReader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.length() < 3) {
                continue;
            }
            if (line.startsWith("PT ")) {
                // beginning of a new item
                sb.append(ENTRY_MARKER).append(line);
            } else if (line.substring(0, 3).trim().length() == 2) {
                // a two-character tag within the first three columns starts a new field
                sb.append(FIELD_MARKER).append(line);
            } else {
                // continuation of the previous field; leading spaces are removed
                sb.append(EOL_MARKER).append(line.trim());
            }
        }
        return sb.toString();
    }

    /**
     * Converts one marked-up entry (see {@link #readMarkedUpInput(BufferedReader)}) into a
     * {@link BibEntry}.
     *
     * @return the entry, or an empty Optional if no non-blank field was found
     */
    private static Optional<BibEntry> parseEntry(String entry) {
        String[] fields = entry.split(FIELD_MARKER);
        if (fields.length == 0) {
            // fallback when splitting at the field marker yields nothing
            fields = entry.split("\n");
        }

        String type = "";
        String publicationType = "";
        String pages = "";
        // A fresh map per entry: no entry can see or alias the fields of another one.
        Map<String, String> hm = new HashMap<>();

        for (String field : fields) {
            // empty field: nothing to do
            if (field.length() <= 2) {
                continue;
            }
            String tag = field.substring(0, 2);
            String value = field.substring(3);
            if (value.startsWith(" - ")) {
                value = value.substring(3);
            }
            value = value.trim();

            switch (tag) {
                case "PT":
                    publicationType = value.startsWith("J") ? "article" : value;
                    type = "article"; // make all of them PT?
                    break;
                case "TY":
                    if ("JOUR".equals(value)) {
                        type = "article";
                    } else if ("CONF".equals(value)) {
                        type = "inproceedings";
                    }
                    break;
                case "JO":
                    hm.put(FieldName.BOOKTITLE, value);
                    break;
                case "AU": {
                    String author = isiAuthorsConvert(value.replace(EOL_MARKER, " and "));
                    // if there is already someone there then append with "and"
                    String previous = hm.get(FieldName.AUTHOR);
                    hm.put(FieldName.AUTHOR, (previous == null) ? author : (previous + " and " + author));
                    break;
                }
                case "TI":
                    hm.put(FieldName.TITLE, value.replace(EOL_MARKER, " "));
                    break;
                case "SO":
                case "JA":
                    hm.put(FieldName.JOURNAL, value.replace(EOL_MARKER, " "));
                    break;
                case "ID":
                case "KW":
                    hm.put(FieldName.KEYWORDS,
                            mergeKeywords(hm.get(FieldName.KEYWORDS), value.replace(EOL_MARKER, " ")));
                    break;
                case "AB":
                    hm.put(FieldName.ABSTRACT, value.replace(EOL_MARKER, " "));
                    break;
                case "BP":
                case "BR":
                case "SP":
                    pages = value;
                    break;
                case "EP": {
                    // tweak for IEEE Explore: keep only the part before the first space
                    // (value is trimmed, so a space can never be at index 0)
                    int firstSpace = value.indexOf(' ');
                    if (firstSpace > 0) {
                        value = value.substring(0, firstSpace);
                    }
                    pages = pages + "--" + value;
                    break;
                }
                case "PS":
                    pages = parsePages(value);
                    break;
                case "AR":
                    pages = value;
                    break;
                case "IS":
                    hm.put(FieldName.NUMBER, value);
                    break;
                case "PY":
                    hm.put(FieldName.YEAR, value);
                    break;
                case "VL":
                    hm.put(FieldName.VOLUME, value);
                    break;
                case "PU":
                    hm.put(FieldName.PUBLISHER, value);
                    break;
                case "DI":
                    hm.put(FieldName.DOI, value);
                    break;
                case "PD": {
                    String month = parseMonth(value);
                    if (month != null) {
                        hm.put(FieldName.MONTH, month);
                    }
                    break;
                }
                case "DT":
                    // set "Review" in Note/Comment?
                    if ("Review".equals(value) || value.startsWith("Article") || value.startsWith("Journal")
                            || "article".equals(publicationType)) {
                        type = "article";
                    } else {
                        type = BibEntry.DEFAULT_TYPE;
                    }
                    break;
                case "CR":
                    hm.put("CitedReferences", value.replace(EOL_MARKER, " ; ").trim());
                    break;
                case "ER":
                case "EF":
                case "VR":
                case "FN":
                    // record/file terminators and header tags carry no field content
                    break;
                default:
                    // Preserve all other fields under their lower-cased tag
                    hm.put(tag.toLowerCase(Locale.ROOT), value);
                    break;
            }
        }

        if (!pages.isEmpty()) {
            hm.put(FieldName.PAGES, pages);
        }

        // Remove blank fields first, so that an entry consisting only of blank fields
        // is skipped as well instead of producing an empty BibEntry.
        hm.values().removeIf(content -> (content == null) || content.trim().isEmpty());
        if (hm.isEmpty()) {
            return Optional.empty();
        }

        // Polish entries
        processSubSup(hm);
        processCapitalization(hm);

        BibEntry b = new BibEntry(type);
        b.setField(hm);
        return Optional.of(b);
    }

    /**
     * Combines already collected keywords with a newly read keyword string.
     * If {@code addition} is already contained in {@code existing}, {@code existing} is kept
     * unchanged (so no previously collected keyword is lost); otherwise it is appended after ", ".
     */
    private static String mergeKeywords(String existing, String addition) {
        if ((existing == null) || existing.isEmpty()) {
            return addition;
        }
        if (existing.contains(addition)) {
            return existing;
        }
        return existing + ", " + addition;
    }

    /**
     * Replaces the last {@code -} in {@code value} by {@code --}.
     *
     * @return the converted page range, or {@code value} unchanged if it contains no {@code -}
     */
    private static String parsePages(String value) {
        int lastDash = value.lastIndexOf('-');
        if (lastDash < 0) {
            // single page or free text: nothing to convert
            return value;
        }
        return value.substring(0, lastDash) + "--" + value.substring(lastDash + 1);
    }

    /**
     * Tries to find a month in {@code value}. The value is split at whitespace and hyphens; first
     * every part is looked up as a month short name, then every part is tried as a month number.
     *
     * @param value content of the publication date field, e.g. {@code "JUN 12"}
     * @return the JabRef format of the first month found, or {@code null} if no part denotes a month
     */
    public static String parseMonth(String value) {
        String[] parts = value.split("\\s|\\-");
        for (String part : parts) {
            Optional<Month> month = Month.getMonthByShortName(part.toLowerCase(Locale.ROOT));
            if (month.isPresent()) {
                return month.get().getJabRefFormat();
            }
        }
        // Try two digit month
        for (String part : parts) {
            try {
                Optional<Month> month = Month.getMonthByNumber(Integer.parseInt(part));
                if (month.isPresent()) {
                    return month.get().getJabRefFormat();
                }
            } catch (NumberFormatException ignored) {
                // not a number: this part cannot be a numeric month, try the next one
            }
        }
        return null;
    }

    /**
     * Will expand ISI first names.
     * <p>
     * An author of the form {@code "Last, FIRST"} whose first-name tokens contain no lower-case
     * letters is rewritten to dotted, space-separated initials, e.g. {@code "Brown, JM"} becomes
     * {@code "Brown, J. M."}. First-name tokens containing lower-case letters are kept as they are.
     * Input that does not consist of exactly two comma-separated parts is returned unchanged.
     * <p>
     * Fixed bug from:
     * http://sourceforge.net/tracker/index.php?func=detail&aid=1542552&group_id=92314&atid=600306
     *
     * @param author a single author name
     * @return the converted name
     */
    public static String isiAuthorConvert(String author) {
        String[] nameParts = author.split(",");
        if (nameParts.length != 2) {
            return author;
        }
        StringBuilder sb = new StringBuilder();
        sb.append(nameParts[0].trim()).append(", ");
        String[] givenNames = nameParts[1].trim().split("\\s+");
        for (int i = 0; i < givenNames.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            String given = givenNames[i];
            // Do we have only uppercase chars?
            if (given.toUpperCase(Locale.ROOT).equals(given)) {
                String initials = given.replace(".", "");
                for (int j = 0; j < initials.length(); j++) {
                    if (j > 0) {
                        sb.append(' ');
                    }
                    sb.append(initials.charAt(j)).append('.');
                }
            } else {
                sb.append(given);
            }
        }
        return sb.toString();
    }

    /** Applies {@link #isiAuthorConvert(String)} to every element and returns the results in a new array. */
    private static String[] isiAuthorsConvert(String[] authors) {
        String[] result = new String[authors.length];
        for (int i = 0; i < result.length; i++) {
            result[i] = isiAuthorConvert(authors[i]);
        }
        return result;
    }

    /**
     * Converts a list of authors separated by {@code " and "} or {@code ";"}.
     *
     * @param authors the author list
     * @return the individually converted authors joined by {@code " and "}
     */
    public static String isiAuthorsConvert(String authors) {
        String[] converted = isiAuthorsConvert(authors.split(" and |;"));
        return String.join(" and ", converted);
    }
}