package ecologylab.serialization.deserializers.parsers.bibtex;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import ecologylab.serialization.ElementState;
import ecologylab.serialization.FieldTypes;
import ecologylab.serialization.SIMPLTranslationException;
import ecologylab.serialization.SimplTypesScope;
import ecologylab.serialization.deserializers.parsers.bibtex.entrytypes.AbstractBibTeXEntry;
import ecologylab.serialization.deserializers.parsers.bibtex.entrytypes.BibTeXInProceedings;
import ecologylab.serialization.formatenums.StringFormat;
/**
 * A hand-written BibTeX parser.
 * <p>
 * The parser walks a character array with a small state machine (see {@link State}) and reports
 * what it finds to a {@link BibTeXEvents} listener: start/end of the whole input, start/end of each
 * entry, the citation key, and each tag with its value. The listener is responsible for building
 * the resulting objects; the parser only collects them.
 *
 * @author quyin
 */
public class BibTeXParser implements FieldTypes
{

  /**
   * States of the parsing automaton, roughly in the order they are visited for one entry:
   * {@code @type { key , tag = value , tag = value }}.
   */
  static enum State
  {
    START, TYPE, BODY_START, KEY_START, KEY, KEY_FINISH, TAG_START, TAG, VALUE_START, VALUE, STOP
  }

  /** Receives every parsing event and builds the objects returned by {@link #parse(char[])}. */
  BibTeXEvents eventListener;

  /** Current state of the automaton; {@link State#STOP} once {@link #parse(char[])} has finished. */
  State state;

  /**
   * Constructor.
   *
   * @param eventListener
   *          An event listener to handle parsing events.
   */
  public BibTeXParser(BibTeXEvents eventListener)
  {
    this.eventListener = eventListener;
  }

  /**
   * The entry method to parse BibTeX for deserialization. Accepts a char array and returns one
   * object per entry, since a single BibTeX file can contain multiple entries.
   * <p>
   * Everything outside an entry (i.e. before an {@code '@'}) is skipped. Leading whitespace before
   * the citation key is ignored, an empty key is reported as the empty string, and a trailing comma
   * before the closing brace of an entry is accepted.
   * <p>
   * Input that ends in the middle of an entry is not reported as an error unless the end is hit
   * while a value is being read or right after one; in the other cases the loop just ends and the
   * entries completed so far are returned.
   *
   * @param data
   *          the BibTeX text; {@code null} or an empty array yields an empty list without any
   *          event being fired.
   * @return the non-null objects the listener produced, one per completed entry, in input order.
   * @throws BibTeXFormatException
   *           if an unexpected character is found or a value is missing / not terminated.
   */
  public <ES extends ElementState> List<ES> parse(char[] data) throws BibTeXFormatException
  {
    List<ES> rst = new ArrayList<ES>();
    if (data == null || data.length == 0)
      return rst;

    eventListener.startBibTeX();
    state = State.START;

    int p = 0;
    int valueStart = 0; // start offset of the token (type, key or tag name) being scanned
    while (p < data.length)
    {
      char c = data[p];
      switch (state)
      {
      case START:
        // Anything before '@' (whitespace, comments, free text) is ignored.
        if (c == '@')
        {
          state = State.TYPE;
          valueStart = p + 1;
        }
        ++p;
        break;

      case TYPE:
        // The type name runs up to the first whitespace or '{'.
        if (Character.isWhitespace(c))
        {
          state = State.BODY_START;
          eventListener.startEntity(new String(data, valueStart, p - valueStart));
        }
        else if (c == '{')
        {
          state = State.KEY_START;
          eventListener.startEntity(new String(data, valueStart, p - valueStart));
        }
        ++p;
        break;

      case BODY_START:
        if (Character.isWhitespace(c))
        {
          ++p;
        }
        else if (c == '{')
        {
          state = State.KEY_START;
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p, "expecting whitespaces or '{'.");
        break;

      case KEY_START:
        if (Character.isWhitespace(c))
        {
          // Skip whitespace between '{' and the key so it does not become part of the key.
          ++p;
        }
        else if (Character.isLetterOrDigit(c))
        {
          state = State.KEY;
          valueStart = p;
          ++p;
        }
        else if (c == ',')
        {
          // No key characters were seen: report an empty key instead of slicing from a stale
          // offset that still points at the entry type.
          state = State.TAG_START;
          eventListener.key("");
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p, "expecting letters, digits or whitespaces.");
        break;

      case KEY:
        // The key runs up to the first whitespace or ','.
        if (Character.isWhitespace(c))
        {
          state = State.KEY_FINISH;
          eventListener.key(new String(data, valueStart, p - valueStart));
        }
        else if (c == ',')
        {
          state = State.TAG_START;
          eventListener.key(new String(data, valueStart, p - valueStart));
        }
        ++p;
        break;

      case KEY_FINISH:
        if (Character.isWhitespace(c))
        {
          ++p;
        }
        else if (c == ',')
        {
          state = State.TAG_START;
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p, "expecting whitespaces or ','.");
        break;

      case TAG_START:
        if (Character.isWhitespace(c))
        {
          ++p;
        }
        else if (Character.isLetterOrDigit(c))
        {
          state = State.TAG;
          valueStart = p;
          ++p;
        }
        else if (c == '}')
        {
          // Trailing comma (or no tags at all): no tag is open here, so just close the entry.
          finishEntity(rst);
          state = State.START;
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p,
              "expecting letters, digits, whitespaces or '}'.");
        break;

      case TAG:
        if (Character.isLetterOrDigit(c) || c == '_')
        {
          ++p;
        }
        else if (Character.isWhitespace(c))
        {
          state = State.VALUE_START;
          eventListener.startTag(new String(data, valueStart, p - valueStart));
          ++p;
        }
        else if (c == '=')
        {
          state = State.VALUE;
          eventListener.startTag(new String(data, valueStart, p - valueStart));
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p, "expecting letters, digits, whitespaces or '='.");
        break;

      case VALUE_START:
        if (Character.isWhitespace(c))
        {
          ++p;
        }
        else if (c == '=')
        {
          state = State.VALUE;
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p, "expecting whitespaces or '='.");
        break;

      case VALUE:
      {
        StringBuilder sb = new StringBuilder();
        p += readValueAndWhitespaces(data, p, sb);
        eventListener.value(sb.toString());

        // The value may be the last thing in the input; report that instead of indexing past
        // the end of the array.
        if (p >= data.length)
          throw new BibTeXFormatException(data, p, "expecting ',' or '}'.");

        if (data[p] == ',')
        {
          eventListener.endTag();
          state = State.TAG_START;
          ++p;
        }
        else if (data[p] == '}')
        {
          eventListener.endTag();
          finishEntity(rst);
          state = State.START;
          ++p;
        }
        else
          throw new BibTeXFormatException(data, p, "expecting ',' or '}'.");
        break;
      }
      }
    }

    state = State.STOP;
    eventListener.endBibTeX();
    return rst;
  }

  /**
   * Closes the current entry: fetches the object the listener built for it, adds it to the result
   * list when it is non-null, and then signals the end of the entry to the listener.
   *
   * @param rst
   *          the list collecting the parsed objects.
   */
  private <ES extends ElementState> void finishEntity(List<ES> rst)
  {
    @SuppressWarnings("unchecked")
    ES object = (ES) eventListener.getBibTeXObject();
    if (object != null)
      rst.add(object);
    eventListener.endEntity();
  }

  /**
   * Reads one tag value starting at {@code p}, together with the whitespace around it.
   * <p>
   * Three forms are recognized after optional leading whitespace:
   * <ul>
   * <li>{@code "..."}: a backslash escapes {@code "} and {@code \}; any other backslash sequence
   * is kept verbatim.</li>
   * <li><code>{...}</code>: braces may nest; a backslash escapes <code>{</code>, <code>}</code>
   * and {@code \}; any other backslash sequence is kept verbatim.</li>
   * <li>a bare token (numbers or proper nouns) that ends at whitespace, {@code ','} or
   * <code>'}'</code>.</li>
   * </ul>
   * The surrounding delimiters are not copied into {@code sb}.
   *
   * @param data
   *          the input characters.
   * @param p
   *          offset at which to start reading.
   * @param sb
   *          receives the value text.
   * @return the number of characters consumed, including leading and trailing whitespace, so that
   *         {@code p + result} is the offset of the next significant character (or the end of
   *         {@code data}).
   * @throws BibTeXFormatException
   *           if there is no value before the end of input, or a quoted / braced value is not
   *           closed.
   */
  private int readValueAndWhitespaces(char[] data, int p, StringBuilder sb)
      throws BibTeXFormatException
  {
    int p0 = p;

    while (p < data.length && Character.isWhitespace(data[p]))
      ++p;
    if (p >= data.length)
      throw new BibTeXFormatException(data, p, "expecting a value.");

    if (data[p] == '"')
    {
      ++p;
      int mode = 0; // 0: normal, 1: previous character was a backslash
      while (p < data.length && !(mode == 0 && data[p] == '"'))
      {
        switch (mode)
        {
        case 0:
          if (data[p] == '\\')
            mode = 1;
          else
            sb.append(data[p]);
          break;
        case 1:
          if (data[p] == '"' || data[p] == '\\')
            sb.append(data[p]);
          else
            sb.append('\\').append(data[p]);
          mode = 0;
          break;
        }
        ++p;
      }
      if (mode != 0 || p >= data.length)
        throw new BibTeXFormatException(data, p, "unclosed quotes.");
      ++p; // skip the closing quote
    }
    else if (data[p] == '{')
    {
      ++p;
      int count = 1; // current brace nesting depth
      int mode = 0; // 0: normal, 1: previous character was a backslash
      while (p < data.length && count > 0)
      {
        switch (mode)
        {
        case 0:
          if (data[p] == '\\')
            mode = 1;
          else
          {
            if (data[p] == '{')
              ++count;
            else if (data[p] == '}')
              --count;
            // The brace that brings the depth back to zero is the delimiter, not content.
            if (count > 0)
              sb.append(data[p]);
          }
          break;
        case 1:
          if (data[p] == '{' || data[p] == '}' || data[p] == '\\')
            sb.append(data[p]);
          else
            sb.append('\\').append(data[p]);
          mode = 0;
          break;
        }
        ++p;
      }
      // Here p is already one past the closing brace, or at the end of input if it is missing.
      if (count > 0)
        throw new BibTeXFormatException(data, p - 1, "unclosed brackets.");
    }
    else
    {
      // numbers or proper nouns; '}' must end the token too, otherwise the brace closing the
      // entry would be swallowed into the value.
      while (p < data.length && !Character.isWhitespace(data[p]) && data[p] != ','
          && data[p] != '}')
      {
        sb.append(data[p]);
        ++p;
      }
    }

    while (p < data.length && Character.isWhitespace(data[p]))
      ++p;
    return p - p0;
  }

  /**
   * Manual check of {@link #readValueAndWhitespaces(char[], int, StringBuilder)}: prints the
   * consumed length and the extracted value for a handful of sample values.
   */
  void testReadValue() throws BibTeXFormatException
  {
    String[] tests =
    { "\"\"", "\"abc\"", "\"ab\\\"c\"", "\"ab\\\\\\\"c\"", "{}", "{abc}", "{ab\\{c}",
        "{ab\\{abc\\}}", "{ab{abc}}", "1234" };
    for (String test : tests)
    {
      StringBuilder sb = new StringBuilder();
      int d = readValueAndWhitespaces(test.toCharArray(), 0, sb);
      System.out.format("%d: %s\n", d, sb.toString());
    }
  }

  /**
   * Manual check of {@link #parse(char[])}: parses two inline entries and prints each result as
   * XML.
   */
  void testParser1() throws BibTeXFormatException, SIMPLTranslationException
  {
    String data = " @inproceedings { article1 , author = \"Author 1\" , title = {TITLE 1} } @inproceedings { article2, author = \"Somebody\", sometag={some value}} ";
    List<BibTeXInProceedings> entities = parse(data.toCharArray());
    for (BibTeXInProceedings entity : entities)
    {
      SimplTypesScope.serialize(entity, System.out, StringFormat.XML);
    }
  }

  /**
   * Manual check of {@link #parse(char[])}: reads a BibTeX file from a fixed local path, parses it
   * and prints each result as XML.
   */
  void testParser2() throws IOException, BibTeXFormatException, SIMPLTranslationException
  {
    StringBuilder sb = new StringBuilder();
    FileReader fr = new FileReader("C:/tmp/iis10.bib");
    try
    {
      char[] buf = new char[4096];
      int len;
      while ((len = fr.read(buf)) >= 0)
        sb.append(buf, 0, len);
    }
    finally
    {
      // Release the file handle even when reading fails.
      fr.close();
    }

    List<AbstractBibTeXEntry> entities = parse(sb.toString().toCharArray());
    for (AbstractBibTeXEntry entity : entities)
    {
      SimplTypesScope.serialize(entity, System.out, StringFormat.XML);
      System.out.println();
      // entity.serialize(System.out, FORMAT.BIBTEX);
    }
  }

  // public static void main(String[] args) throws BibTeXFormatException, SIMPLTranslationException,
  // IOException
  // {
  // BibTeXEvents listener = new ElementStateBibTeXHandler(BibTeXEntryTranslationScope.get());
  // BibTeXParser parser = new BibTeXParser(listener);
  // // parser.testReadValue();
  // // parser.testParser1();
  // parser.testParser2();
  // }
}