package edu.isi.bmkeg.lapdf.xml;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import edu.isi.bmkeg.lapdf.model.Block;
import edu.isi.bmkeg.lapdf.model.ChunkBlock;
import edu.isi.bmkeg.lapdf.model.LapdfDocument;
import edu.isi.bmkeg.lapdf.model.PageBlock;
import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering;
/**
 * Writes a {@link LapdfDocument} to an XML file built from
 * {@code article}, {@code front}, {@code body} and {@code back} elements.
 *
 * <p>Chunks are first grouped by the part of their type that precedes the
 * first {@code '.'}; each group is then serialized as one or more
 * {@code sec} elements containing {@code title} and {@code p} children.
 */
public class OpenAccessXMLWriter implements XMLWriter {

    /** Character encoding handed to the serializer's output format. */
    public static final String ENCODING = "UTF-8";

    // Element and attribute names used in the generated XML.
    public static final String ELEMENT_NAME_SECTION = "sec";
    public static final String SECTION_ATTRIBUTE_SEC_TYPE = "sec-type";
    public static final String ELEMENT_NAME_TITLE = "title";
    public static final String ELEMENT_NAME_PARAGRAPH = "p";
    public static final String ELEMENT_NAME_ARTICLE = "article";
    public static final String ELEMENT_NAME_BODY = "body";
    public static final String ELEMENT_NAME_ABSTRACT = "abstract";
    public static final String ELEMENT_NAME_FRONT = "front";
    public static final String ELEMENT_NAME_ARTICLE_META = "article-meta";
    public static final String ELEMENT_NAME_ACKNOWLEDGEMENT = "ack";
    public static final String ELEMENT_NAME_BACK = "back";
    public static final String ELEMENT_NAME_REFERENCES = "ref";

    /** Shared attribute list for elements that carry no attributes. */
    public static final AttributesImpl emptyAttribute = new AttributesImpl();

    /**
     * Chunks of the document being written, keyed by base type (the chunk
     * type up to, but excluding, its first {@code '.'}). Rebuilt from scratch
     * on every call to {@link #write(LapdfDocument, String)}.
     */
    public HashMap<String, ArrayList<ChunkBlock>> mappedDocument = new HashMap<String, ArrayList<ChunkBlock>>();

    /**
     * Serializes {@code document} as XML into the file {@code outputFilename}.
     *
     * <p>I/O and SAX failures are not propagated; their stack trace is
     * printed and the method returns. The output stream is closed in every
     * case.
     *
     * @param document       the document whose chunks are written
     * @param outputFilename path of the file to create or overwrite
     */
    @Override
    public void write(LapdfDocument document, String outputFilename) {
        FileOutputStream xmlOutputFileStream = null;
        try {
            xmlOutputFileStream = new FileOutputStream(outputFilename);
            OutputFormat xmlOutputFormat = new OutputFormat("XML", ENCODING,
                    false);
            XMLSerializer serializer = new XMLSerializer(xmlOutputFileStream,
                    xmlOutputFormat);
            ContentHandler handler = serializer.asContentHandler();

            // Drop chunks left over from an earlier write() on this instance,
            // otherwise two documents would be merged into one output.
            mappedDocument.clear();
            populateMap(document);

            handler.startDocument();
            handler.startElement("", "", ELEMENT_NAME_ARTICLE, emptyAttribute);

            handler.startElement("", "", ELEMENT_NAME_FRONT, emptyAttribute);
            handler.startElement("", "", ELEMENT_NAME_ARTICLE_META,
                    emptyAttribute);
            doAbstractAndAcknowledgment(Block.TYPE_ABSTRACT, handler);
            handler.endElement("", "", ELEMENT_NAME_ARTICLE_META);
            handler.endElement("", "", ELEMENT_NAME_FRONT);

            handler.startElement("", "", ELEMENT_NAME_BODY, emptyAttribute);
            // NOTE(review): discussion is emitted before results; the order is
            // kept as it was - confirm that it is intended.
            doSection(Block.TYPE_INTRODUCTION, handler);
            doSection(Block.TYPE_METHODS, handler);
            doSection(Block.TYPE_DISCUSSION, handler);
            doSection(Block.TYPE_RESULTS, handler);
            doSection(Block.TYPE_CONCLUSIONS, handler);
            handler.endElement("", "", ELEMENT_NAME_BODY);

            handler.startElement("", "", ELEMENT_NAME_BACK, emptyAttribute);
            doAbstractAndAcknowledgment(Block.TYPE_ACKNOWLEDGEMENTS, handler);
            doReferences(handler);
            handler.endElement("", "", ELEMENT_NAME_BACK);

            handler.endElement("", "", ELEMENT_NAME_ARTICLE);
            handler.endDocument();
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            // Close on every path so a failure does not leak the file handle.
            if (xmlOutputFileStream != null) {
                try {
                    xmlOutputFileStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Writes the body section stored under {@code type}, if there is one.
     *
     * @param type    key into {@link #mappedDocument}
     * @param handler receiver of the SAX events
     * @throws SAXException if the handler rejects an event
     */
    private void doSection(String type, ContentHandler handler)
            throws SAXException {
        ArrayList<ChunkBlock> list = mappedDocument.get(type);
        if (list == null || list.isEmpty()) {
            return;
        }
        writeSection(handler, list);
    }

    /**
     * Writes one {@code sec} element for a non-empty list of chunks.
     *
     * <p>The list is sorted in place. When the first chunk after sorting is a
     * heading, it supplies the section's {@code sec-type} attribute and
     * {@code title}, and is removed from the list; otherwise the section is
     * opened without attributes or title.
     *
     * @param handler   receiver of the SAX events
     * @param chunkList chunks of the section; must not be empty
     * @throws SAXException if the handler rejects an event
     */
    private void writeSection(ContentHandler handler,
            ArrayList<ChunkBlock> chunkList) throws SAXException {
        Collections.sort(chunkList, new SpatialOrdering(
                SpatialOrdering.PAGE_COLUMN_AWARE_MIXED_MODE));

        ChunkBlock first = chunkList.get(0);
        if (first.getType().contains(Block.META_TYPE_HEADING)) {
            List<ChunkBlock> heading = Collections.singletonList(first);
            AttributesImpl sectionAttribute = new AttributesImpl();
            sectionAttribute.addAttribute("", "", SECTION_ATTRIBUTE_SEC_TYPE,
                    "CDATA", createSectionType(first));
            handler.startElement("", "", ELEMENT_NAME_SECTION,
                    sectionAttribute);
            writeTextElement(handler, ELEMENT_NAME_TITLE,
                    createStringFromChunk(heading));
            // The heading has been consumed; keep it out of the paragraphs.
            chunkList.removeAll(heading);
        } else {
            handler.startElement("", "", ELEMENT_NAME_SECTION, emptyAttribute);
        }

        writeChunks(handler, chunkList);
        handler.endElement("", "", ELEMENT_NAME_SECTION);
    }

    /**
     * Writes the content of an already opened section: runs of subtitle
     * chunks open a nested {@code sec} with a {@code title}, every other
     * chunk contributes paragraph text.
     *
     * <p>A chunk whose text ends with {@code '-'} is treated as a word broken
     * across chunks: the hyphen is dropped and the text is joined with the
     * following chunk. Text still buffered after the last chunk is written as
     * a final paragraph, and subtitles that are not followed by any other
     * chunk are written as a titled sub-section, so no chunk text is lost.
     * Any nested {@code sec} opened here is closed before returning.
     *
     * @param handler receiver of the SAX events
     * @param chunks  chunks in output order
     * @throws SAXException if the handler rejects an event
     */
    private void writeChunks(ContentHandler handler, List<ChunkBlock> chunks)
            throws SAXException {
        List<ChunkBlock> pendingSubtitles = new ArrayList<ChunkBlock>();
        StringBuilder paragraph = new StringBuilder();
        boolean subSectionOpen = false;

        for (ChunkBlock chunk : chunks) {
            if (chunk.getType().contains(Block.META_TYPE_SUBTITLE)) {
                pendingSubtitles.add(chunk);
                continue;
            }
            if (!pendingSubtitles.isEmpty()) {
                startSubSection(handler, pendingSubtitles, subSectionOpen);
                subSectionOpen = true;
            }
            String chunkText = chunk.readChunkText();
            // endsWith() is also safe for empty text and for text that holds
            // an earlier hyphen in addition to the trailing one.
            if (chunkText.endsWith("-")) {
                paragraph.append(chunkText, 0, chunkText.length() - 1);
            } else {
                paragraph.append(chunkText);
                writeTextElement(handler, ELEMENT_NAME_PARAGRAPH,
                        paragraph.toString());
                paragraph.setLength(0);
            }
        }

        if (paragraph.length() > 0) {
            writeTextElement(handler, ELEMENT_NAME_PARAGRAPH,
                    paragraph.toString());
        }
        if (!pendingSubtitles.isEmpty()) {
            startSubSection(handler, pendingSubtitles, subSectionOpen);
            subSectionOpen = true;
        }
        if (subSectionOpen) {
            handler.endElement("", "", ELEMENT_NAME_SECTION);
        }
    }

    /**
     * Opens a nested {@code sec} titled with the text of {@code subtitles},
     * closing the previous nested {@code sec} first when requested. The
     * subtitle list is emptied afterwards.
     *
     * @param handler       receiver of the SAX events
     * @param subtitles     subtitle chunks forming the title; cleared on return
     * @param closePrevious whether a nested {@code sec} is currently open
     * @throws SAXException if the handler rejects an event
     */
    private void startSubSection(ContentHandler handler,
            List<ChunkBlock> subtitles, boolean closePrevious)
            throws SAXException {
        if (closePrevious) {
            handler.endElement("", "", ELEMENT_NAME_SECTION);
        }
        handler.startElement("", "", ELEMENT_NAME_SECTION, emptyAttribute);
        writeTextElement(handler, ELEMENT_NAME_TITLE,
                createStringFromChunk(subtitles));
        subtitles.clear();
    }

    /**
     * Writes {@code <elementName>text</elementName>} without attributes.
     *
     * @param handler     receiver of the SAX events
     * @param elementName qualified name of the element
     * @param text        character content of the element
     * @throws SAXException if the handler rejects an event
     */
    private void writeTextElement(ContentHandler handler, String elementName,
            String text) throws SAXException {
        char[] characters = text.toCharArray();
        handler.startElement("", "", elementName, emptyAttribute);
        handler.characters(characters, 0, characters.length);
        handler.endElement("", "", elementName);
    }

    /**
     * Maps a chunk's type to the value used for the {@code sec-type}
     * attribute.
     *
     * @param chunk the heading chunk of a section
     * @return the attribute value, or {@code null} when the chunk type
     *         contains none of the known section types
     */
    private String createSectionType(ChunkBlock chunk) {
        String type = chunk.getType();
        if (type.contains(Block.TYPE_INTRODUCTION)) {
            return "intro";
        }
        if (type.contains(Block.TYPE_METHODS)) {
            return "materials|methods";
        }
        if (type.contains(Block.TYPE_DISCUSSION)) {
            return "discussion";
        }
        if (type.contains(Block.TYPE_CONCLUSIONS)) {
            return "conclusions";
        }
        if (type.contains(Block.TYPE_RESULTS)) {
            return "results";
        }
        return null;
    }

    /**
     * Concatenates the trimmed text of every chunk, in list order, and trims
     * the result.
     *
     * <p>NOTE(review): the pieces are joined without a separator, so a title
     * spread over several chunks is glued together - confirm this is wanted.
     *
     * @param chunkList chunks whose text is joined
     * @return the joined text; empty for an empty list
     */
    private String createStringFromChunk(List<ChunkBlock> chunkList) {
        StringBuilder joined = new StringBuilder();
        for (ChunkBlock chunk : chunkList) {
            joined.append(chunk.readChunkText().trim());
        }
        return joined.toString().trim();
    }

    /**
     * Writes the chunks stored under {@code type} inside an {@code abstract}
     * element when {@code type} equals {@link Block#TYPE_ABSTRACT}, and inside
     * an {@code ack} element for any other type. Nothing is written when no
     * chunks are stored under {@code type}. The stored list is sorted in
     * place.
     *
     * @param type    key into {@link #mappedDocument}
     * @param handler receiver of the SAX events
     * @throws SAXException if the handler rejects an event
     */
    private void doAbstractAndAcknowledgment(String type,
            ContentHandler handler) throws SAXException {
        ArrayList<ChunkBlock> list = mappedDocument.get(type);
        if (list == null || list.isEmpty()) {
            return;
        }
        String element = Block.TYPE_ABSTRACT.equals(type)
                ? ELEMENT_NAME_ABSTRACT
                : ELEMENT_NAME_ACKNOWLEDGEMENT;

        Collections.sort(list, new SpatialOrdering(
                SpatialOrdering.PAGE_COLUMN_AWARE_MIXED_MODE));

        handler.startElement("", "", element, emptyAttribute);
        handler.startElement("", "", ELEMENT_NAME_SECTION, emptyAttribute);
        writeChunks(handler, list);
        handler.endElement("", "", ELEMENT_NAME_SECTION);
        handler.endElement("", "", element);
    }

    /**
     * Fills {@link #mappedDocument} with every chunk of every page
     * (pages are requested by number, starting at 1), keyed by the chunk
     * type truncated at its first {@code '.'}.
     *
     * @param document the document to read chunks from
     */
    private void populateMap(LapdfDocument document) {
        int totalNumberOfPages = document.getTotalNumberOfPages();
        for (int pageNumber = 1; pageNumber <= totalNumberOfPages; pageNumber++) {
            PageBlock page = document.getPage(pageNumber);
            List<ChunkBlock> chunks = page
                    .getAllChunkBlocks(SpatialOrdering.COLUMN_AWARE_MIXED_MODE);
            for (ChunkBlock chunk : chunks) {
                String fullType = chunk.getType();
                int dot = fullType.indexOf('.');
                String type = (dot >= 0) ? fullType.substring(0, dot)
                        : fullType;

                ArrayList<ChunkBlock> chunkList = mappedDocument.get(type);
                if (chunkList == null) {
                    chunkList = new ArrayList<ChunkBlock>();
                    mappedDocument.put(type, chunkList);
                }
                chunkList.add(chunk);
            }
        }
    }

    /**
     * Writes all non-heading chunks stored under
     * {@link Block#TYPE_REFERENCES} as one paragraph inside
     * {@code ref}/{@code sec}. Chunk texts are concatenated in stored order
     * without a separator. Nothing is written when no reference chunks are
     * stored.
     *
     * @param handler receiver of the SAX events
     * @throws SAXException if the handler rejects an event
     */
    private void doReferences(ContentHandler handler) throws SAXException {
        ArrayList<ChunkBlock> list = mappedDocument.get(Block.TYPE_REFERENCES);
        if (list == null || list.isEmpty()) {
            return;
        }
        StringBuilder references = new StringBuilder();
        for (ChunkBlock chunk : list) {
            if (!chunk.getType().contains(Block.META_TYPE_HEADING)) {
                references.append(chunk.readChunkText());
            }
        }

        handler.startElement("", "", ELEMENT_NAME_REFERENCES, emptyAttribute);
        handler.startElement("", "", ELEMENT_NAME_SECTION, emptyAttribute);
        writeTextElement(handler, ELEMENT_NAME_PARAGRAPH,
                references.toString());
        handler.endElement("", "", ELEMENT_NAME_SECTION);
        handler.endElement("", "", ELEMENT_NAME_REFERENCES);
    }
}