package de.berlin.hu.uima.cr.xml;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.semantic.NamedEntity;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.util.Constants;
/**
 * Collection reader that turns XML documents containing {@code PMID},
 * {@code ArticleTitle} and {@code AbstractText} elements into CASes.
 *
 * <p>For every document the CAS text is set to the title and the abstract
 * separated by a blank line. A {@link SourceDocumentInformation}, a
 * {@link PubmedDocument} spanning the whole text, and one {@link NamedEntity}
 * per non-empty {@code METABOLITE} / {@code ENZYME} child element of the title
 * or abstract are added to the indexes.
 */
public class NaCTeMCollectionReader extends XMLCollectionReader {

    /** Text inserted between the title and the abstract in the document text. */
    private static final String TITLE_ABSTRACT_SEPARATOR = "\n\n";

    /**
     * Lists the regular files directly inside {@code inputDir} whose names end
     * with {@code .xml}.
     *
     * @param inputDir path of the directory to scan (not recursive)
     * @return the matching files; empty if the directory holds none
     * @throws IllegalArgumentException if {@code inputDir} cannot be listed
     *         (it does not exist, is not a directory, or an I/O error occurred)
     */
    @Override
    protected List<File> getfiles(String inputDir) {
        // File.listFiles() returns null rather than throwing when the path is
        // not a listable directory; fail with a descriptive message instead of
        // a bare NullPointerException.
        File[] entries = new File(inputDir).listFiles();
        if (entries == null) {
            throw new IllegalArgumentException("Input directory cannot be listed: " + inputDir);
        }
        List<File> result = new ArrayList<File>();
        for (File entry : entries) {
            if (entry.isFile() && entry.getName().endsWith(".xml")) {
                result.add(entry);
            }
        }
        return result;
    }

    /**
     * Fills {@code aCAS} with the next document supplied by
     * {@code getNextDocument()}.
     *
     * @param aCAS the CAS to populate
     * @throws IOException propagated from the underlying reader
     * @throws CollectionException if the JCas cannot be obtained or the
     *         document has no {@code PMID} element
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        Document document = getNextDocument();
        JCas jcas;
        try {
            jcas = aCAS.getJCas();
        } catch (CASException e) {
            throw new CollectionException(e);
        }

        Node pmidNode = document.getElementsByTagName("PMID").item(0);
        if (pmidNode == null) {
            throw new CollectionException(new IllegalStateException(
                    "Document has no PMID element: " + document.getDocumentURI()));
        }
        String pmid = pmidNode.getTextContent();

        Node titleNode = document.getElementsByTagName("ArticleTitle").item(0);
        String title = titleNode != null ? titleNode.getTextContent() : "";
        Node abstractNode = document.getElementsByTagName("AbstractText").item(0);
        String abstr = abstractNode != null ? abstractNode.getTextContent() : "";

        String text = title + TITLE_ABSTRACT_SEPARATOR + abstr;
        jcas.setDocumentText(text);

        SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
        // getDocumentURI() may be null (e.g. a document built without a
        // location); leave the URI unset in that case.
        String uri = document.getDocumentURI();
        if (uri != null) {
            srcDocInfo.setUri(uri);
        }
        srcDocInfo.setOffsetInSource(0);
        srcDocInfo.setDocumentSize(text.length());
        // Flag the final document of the collection, as the UIMA example
        // readers do. Passing hasNext() directly flagged every document except
        // the last one.
        // NOTE(review): assumes getNextDocument() has already advanced past
        // the current document - confirm against XMLCollectionReader.
        srcDocInfo.setLastSegment(!hasNext());
        srcDocInfo.addToIndexes();

        PubmedDocument abstractAnnotation = new PubmedDocument(jcas);
        abstractAnnotation.setBegin(0);
        abstractAnnotation.setEnd(text.length());
        abstractAnnotation.setPmid(pmid);
        abstractAnnotation.addToIndexes(jcas);

        // Title children first, then abstract children: the same order in
        // which their text appears in the document text.
        List<Node> nodes = new ArrayList<Node>();
        collectChildren(titleNode, nodes);
        collectChildren(abstractNode, nodes);
        annotateChemicals(jcas, text, nodes);
    }

    /** Appends the direct children of {@code parent} to {@code target}; no-op for null. */
    private static void collectChildren(Node parent, List<Node> target) {
        if (parent == null) {
            return;
        }
        NodeList children = parent.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            target.add(children.item(i));
        }
    }

    /**
     * Adds a gold-standard {@link NamedEntity} for every non-empty
     * {@code METABOLITE} or {@code ENZYME} node, located by searching its text
     * content in {@code text} from the end of the previous match onwards.
     *
     * <p>Because the lookup is a plain substring search, an untagged earlier
     * occurrence of the same string after the previous match is chosen over
     * the tagged one.
     */
    private static void annotateChemicals(JCas jcas, String text, List<Node> nodes) {
        int searchFrom = 0;
        for (Node node : nodes) {
            String nodeName = node.getNodeName();
            if (!"METABOLITE".equals(nodeName) && !"ENZYME".equals(nodeName)) {
                continue;
            }
            String chemical = node.getTextContent();
            if (chemical.trim().isEmpty()) {
                continue;
            }
            int begin = text.indexOf(chemical, searchFrom);
            if (begin < 0) {
                continue;
            }
            int end = begin + chemical.length();

            NamedEntity namedEntity = new NamedEntity(jcas);
            namedEntity.setBegin(begin);
            namedEntity.setEnd(end);
            namedEntity.setConfidence(1.0);
            namedEntity.setSource(Constants.GOLDSTANDARD);
            namedEntity.addToIndexes();

            // Continue right after this match, in absolute text coordinates.
            // The previous code stored the substring-relative end (plus one),
            // which reset the search position and could skip an entity that
            // immediately follows this one.
            searchFrom = end;
        }
    }
}