package net.sf.jabref.imports;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.Vector;
import net.sf.jabref.BibtexEntry;
import net.sf.jabref.Globals;
import net.sf.jabref.Util;
/**
 * Heuristic analyzer that tries to recognise BibTeX field values (year, pages,
 * journal and volume) inside a free-text bibliographic reference.
 *
 * <p>The guesses are currently only reported through {@code Util.pr}; nothing in
 * this class assigns the {@link #be} field, so {@link #getEntry()} returns
 * {@code null} unless some other code sets it.
 */
public class TextAnalyzer {

    /** Four digits preceded by whitespace or "(" and followed by ".", "," or ")". */
    private static final Pattern YEAR_PATTERN =
            Pattern.compile("(\\s|\\()\\d\\d\\d\\d(\\.|,|\\))");

    /** Two numbers of 1-4 digits joined by a dash, e.g. " 12-34," or " 12 - 34 ". */
    private static final Pattern PAGES_PATTERN =
            Pattern.compile("\\s(\\d{1,4})( ??)-( ??)(\\d{1,4})(\\.|,|\\s)");

    /** A run of 8-30 letters, dots and spaces followed by an optional "vol" marker and a 1-3 digit number. */
    private static final Pattern JOURNAL_PATTERN = Pattern.compile(
            "(,|\\.|\\n)\\s??([a-zA-Z\\. ]{8,30}+)((.){0,2})((vol\\.|Vol\\.|Volume|volume))??\\s??(\\d{1,3})(\\.|,|\\s|:)");

    /** Four-digit numbers at or above this value are never accepted as a year. */
    private static final int MAX_PLAUSIBLE_YEAR = 2500;

    /** A page range is only trusted among several candidates if it spans more than this many pages. */
    private static final int MIN_PAGE_SPAN = 3;

    /** Unclaimed stretches of text must be longer than this to be reported as a free part. */
    private static final int MIN_FREE_LENGTH = 10;

    /** Entry exposed through {@link #getEntry()}; never assigned inside this class. */
    BibtexEntry be = null;

    /**
     * Creates the analyzer and immediately runs {@link #guessBibtexFields(String)}.
     *
     * @param text the raw reference text to analyze
     */
    public TextAnalyzer(String text) {
        guessBibtexFields(text);
    }

    /**
     * @return the value of the {@code be} field, which this class leaves at {@code null}
     */
    public BibtexEntry getEntry() {
        return be;
    }

    /**
     * Looks for a year, a page range and a journal/volume pair in {@code text},
     * reports each guess via {@code Util.pr}, and finally reports the longer
     * stretches of text that were not claimed by any guess (the "free parts").
     *
     * @param text the reference text; {@code null} is treated as empty text
     */
    public void guessBibtexFields(String text) {
        TreeSet<Substring> usedParts = new TreeSet<Substring>();
        // Pad with spaces so that patterns requiring a delimiter on both sides
        // can also match at the very beginning or end of the input.
        text = " " + (text == null ? "" : text) + " ";

        String year = guessYear(text);
        if (year != null) {
            markUsed(usedParts, "year", text, year);
            Util.pr("Guessing 'year': '" + year + "'");
        }

        String pagesMatch = findPagesMatch(text);
        if (pagesMatch != null) {
            // Normalise the dash (with any surrounding blanks) to the BibTeX "--".
            String pages = clean(pagesMatch.replaceAll("\\s*-\\s*", "--"));
            // The claimed span is the raw match, so its length comes from the
            // match itself and not from any other field.
            markUsed(usedParts, "pages", text, pagesMatch);
            Util.pr("Guessing 'pages': '" + pages + "'");
        }

        guessJournalAndVolume(text, usedParts);

        // Then try to find title and authors among what is left.
        List<String> free = collectFreeParts(text, usedParts);
        Util.pr("Free parts:");
        for (String part : free) {
            Util.pr(": '" + part + "'");
        }
    }

    /**
     * Returns every non-empty, non-overlapping match of {@code regexp} in
     * {@code text}, in order of appearance. Matches that extend to the end of
     * the text are included.
     *
     * @param text   the text to search
     * @param regexp the regular expression to look for
     * @return the matched substrings; an empty array if there is none
     * @throws java.util.regex.PatternSyntaxException if {@code regexp} is invalid
     */
    public String[] getMatches(String text, String regexp) {
        return findMatches(text, Pattern.compile(regexp));
    }

    /** Collects all non-empty matches of an already compiled pattern. */
    private static String[] findMatches(String text, Pattern pattern) {
        List<String> found = new ArrayList<String>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            if (matcher.end() > matcher.start()) {
                found.add(matcher.group());
            }
        }
        return found.toArray(new String[found.size()]);
    }

    /**
     * Picks the most plausible year among the four-digit candidates.
     *
     * @return the cleaned year string, or {@code null} if there is no candidate
     *         or the candidates are ambiguous
     */
    private String guessYear(String text) {
        String[] cand = findMatches(text, YEAR_PATTERN);
        if (cand.length == 0) {
            return null;
        }
        if (cand.length == 1) {
            // Only one four-digit number, so we guess that is the year.
            return clean(cand[0]);
        }

        // More than one four-digit number, so look for one giving a reasonable year.
        int good = -1;
        int yearFound = -1;
        for (int i = 0; i < cand.length; i++) {
            // Candidates still carry their delimiters ("(1999)", " 1999,"), so
            // they must be cleaned before parsing.
            int number = Integer.parseInt(clean(cand[i]));
            if (number == yearFound || number >= MAX_PLAUSIBLE_YEAR) {
                continue;
            }
            if (good == -1) {
                good = i;
                yearFound = number;
            } else if (number < Globals.FUTURE_YEAR) {
                if (yearFound < Globals.FUTURE_YEAR) {
                    // Two different candidates both look fine: give up.
                    good = -1;
                    break;
                }
                // The earlier pick was at or beyond FUTURE_YEAR; prefer this one.
                good = i;
                yearFound = number;
            }
        }
        return good >= 0 ? clean(cand[good]) : null;
    }

    /**
     * Finds the raw text of the page-range match to use.
     *
     * @return the single match if there is exactly one; otherwise the first
     *         match whose range spans more than {@link #MIN_PAGE_SPAN} pages;
     *         {@code null} if nothing qualifies
     */
    private String findPagesMatch(String text) {
        String[] cand = findMatches(text, PAGES_PATTERN);
        if (cand.length == 1) {
            return cand[0];
        }
        for (String candidate : cand) {
            String[] bounds = clean(candidate.replaceAll("\\s", "")).split("-");
            int first = Integer.parseInt(bounds[0]);
            int second = Integer.parseInt(bounds[1]);
            if (second - first > MIN_PAGE_SPAN) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Tries to split the first journal-pattern match into a journal name and a
     * trailing volume number, reports both and marks the journal name as used.
     */
    private void guessJournalAndVolume(String text, TreeSet<Substring> usedParts) {
        String[] cand = findMatches(text, JOURNAL_PATTERN);
        if (cand.length == 0) {
            // No journal found. The original code prepared a second regexp here
            // but never applied it; that dead assignment has been dropped.
            return;
        }
        String match = cand[0].trim();
        int pos = match.lastIndexOf(' ');
        if (pos <= 0) {
            return;
        }
        String volume = clean(match.substring(pos + 1));
        Util.pr("Guessing 'volume': '" + volume + "'");

        String journal = clean(match.substring(0, pos));
        pos = journal.lastIndexOf(' ');
        if (pos > 0) {
            // Drop a trailing volume marker that belongs to the number, not the name.
            String last = journal.substring(pos + 1);
            if (last.equalsIgnoreCase("volume") || last.equalsIgnoreCase("vol")
                    || last.equalsIgnoreCase("v")) {
                journal = clean(journal.substring(0, pos));
            }
        }
        markUsed(usedParts, "journal", text, journal);
        Util.pr("Guessing 'journal': '" + journal + "'");
    }

    /**
     * Records the first occurrence of {@code fragment} in {@code text} as a
     * used span. Empty or absent fragments are ignored.
     */
    private static void markUsed(TreeSet<Substring> usedParts, String name,
                                 String text, String fragment) {
        if (fragment.length() == 0) {
            return;
        }
        int pos = text.indexOf(fragment);
        if (pos >= 0) {
            usedParts.add(new Substring(name, pos, pos + fragment.length()));
        }
    }

    /**
     * Returns the cleaned stretches of {@code text} longer than
     * {@link #MIN_FREE_LENGTH} characters that lie between (or after) the used spans.
     */
    private List<String> collectFreeParts(String text, TreeSet<Substring> usedParts) {
        List<String> free = new ArrayList<String>();
        int piv = 0;
        for (Substring used : usedParts) {
            if (used.begin() - piv > MIN_FREE_LENGTH) {
                String gap = text.substring(piv, used.begin());
                Util.pr("... " + gap);
                free.add(clean(gap));
            }
            // Never move backwards if spans overlap.
            piv = Math.max(piv, used.end());
        }
        if (text.length() - piv > MIN_FREE_LENGTH) {
            free.add(clean(text.substring(piv)));
        }
        return free;
    }

    /**
     * Strips leading and trailing whitespace and the characters
     * {@code . , ( ) :} from {@code s}.
     *
     * @return the stripped string; empty if nothing else remains
     */
    private static String clean(String s) {
        int left = 0;
        while (left < s.length() && isStrippable(s.charAt(left))) {
            left++;
        }
        int right = s.length() - 1;
        while (right > left && isStrippable(s.charAt(right))) {
            right--;
        }
        return s.substring(left, Math.min(right + 1, s.length()));
    }

    /** Tells whether {@code c} is whitespace or one of the delimiters removed by {@link #clean(String)}. */
    private static boolean isStrippable(char c) {
        return Character.isWhitespace(c) || c == '.' || c == ',' || c == '('
                || c == ')' || c == ':';
    }

    /**
     * A half-open character range {@code [begin, end)} of the analyzed text that
     * has been claimed by a field guess. Ordering (and therefore uniqueness in a
     * {@link TreeSet}) is by {@code begin} only.
     */
    private static class Substring implements Comparable<Substring> {
        private final String name;
        private final int begin;
        private final int end;

        public Substring(String name, int begin, int end) {
            this.name = name;
            this.begin = begin;
            this.end = end;
        }

        public int begin() {
            return begin;
        }

        public int end() {
            return end;
        }

        public int compareTo(Substring other) {
            if (begin < other.begin) {
                return -1;
            }
            return begin == other.begin ? 0 : 1;
        }

        public String toString() {
            return name + "[" + begin + "," + end + ")";
        }
    }
}