package de.berlin.hu.uima.cr.xml;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.u_compare.shared.semantic.NamedEntity;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import de.berlin.hu.types.PubmedDocument;
import de.berlin.hu.util.Constants;
/**
 * Collection reader for a patent corpus laid out as one sub-directory per
 * document, each containing a file named {@code scrapbook.xml}.
 *
 * <p>For every document the text of all {@code snippet} elements is
 * concatenated (each snippet trimmed and followed by a blank line) and used as
 * the CAS document text. Direct {@code ne} children of a snippet that carry a
 * {@code chebi-id} attribute are added as gold-standard {@link NamedEntity}
 * annotations.
 */
public class PatentCorpusCollectionReader extends XMLCollectionReader {

    /** File name looked for inside every sub-directory of the input directory. */
    private static final String SCRAPBOOK_FILE_NAME = "scrapbook.xml";

    /**
     * Collects the {@code scrapbook.xml} file of every direct sub-directory of
     * {@code inputDir}. At most one file is taken per sub-directory.
     *
     * @param inputDir path of the corpus root directory
     * @return the scrapbook files found; empty if there are none
     * @throws IllegalArgumentException if {@code inputDir} is not a readable directory
     */
    @Override
    protected List<File> getfiles(String inputDir) {
        List<File> result = new ArrayList<File>();
        File[] subdirs = new File(inputDir).listFiles();
        if (subdirs == null) {
            // File.listFiles() returns null (not an empty array) when the path is
            // not a directory or an I/O error occurs; fail with a clear message
            // instead of a bare NullPointerException.
            throw new IllegalArgumentException(
                    "Input directory does not exist or cannot be read: " + inputDir);
        }
        for (File subdir : subdirs) {
            if (!subdir.isDirectory()) {
                continue;
            }
            File[] files = subdir.listFiles();
            if (files == null) {
                // Unreadable sub-directory: skip it rather than abort the whole scan.
                continue;
            }
            for (File file : files) {
                if (file.isFile() && SCRAPBOOK_FILE_NAME.equals(file.getName())) {
                    result.add(file);
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Fills {@code aCAS} with the next document: sets the document text, adds a
     * {@link SourceDocumentInformation}, a {@link PubmedDocument} spanning the
     * whole text, and one {@link NamedEntity} per located {@code ne} element.
     *
     * @param aCAS the CAS to populate
     * @throws IOException if reading the next document fails
     * @throws CollectionException if the JCas view of {@code aCAS} cannot be obtained
     */
    @Override
    public void getNext(CAS aCAS) throws IOException, CollectionException {
        Document document = getNextDocument();
        JCas jcas;
        try {
            jcas = aCAS.getJCas();
        } catch (CASException e) {
            throw new CollectionException(e);
        }

        // Build the document text from all snippets, remembering the snippet
        // nodes so that their entities can be located in the text afterwards.
        NodeList snippetlist = document.getElementsByTagName("snippet");
        List<Node> snippetNodes = new ArrayList<Node>(snippetlist.getLength());
        StringBuilder textBuilder = new StringBuilder();
        for (int i = 0; i < snippetlist.getLength(); i++) {
            Node node = snippetlist.item(i);
            textBuilder.append(node.getTextContent().trim()).append("\n\n");
            snippetNodes.add(node);
        }
        String text = textBuilder.toString();
        jcas.setDocumentText(text);

        SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
        // Strip the last path segment so the URI names the containing directory.
        srcDocInfo.setUri(document.getDocumentURI().replaceFirst("(/|\\\\)[^/\\\\]+$", ""));
        srcDocInfo.setOffsetInSource(0);
        srcDocInfo.setDocumentSize(text.length());
        // NOTE(review): the flag is set to hasNext() (defined outside this class),
        // which looks inverted relative to the name "lastSegment" - confirm whether
        // !hasNext() is intended before changing; behaviour is kept as-is here.
        srcDocInfo.setLastSegment(hasNext());
        srcDocInfo.addToIndexes();

        PubmedDocument abstractAnnotation = new PubmedDocument(jcas);
        abstractAnnotation.setBegin(0);
        abstractAnnotation.setEnd(text.length());
        abstractAnnotation.setPmid("");
        abstractAnnotation.addToIndexes(jcas);

        annotateNamedEntities(jcas, snippetNodes, text);
    }

    /**
     * Adds a gold-standard {@link NamedEntity} for every direct {@code ne} child
     * of the given snippets that has non-empty text and a {@code chebi-id}
     * attribute. Entities are located by searching their text in {@code text},
     * always starting after the previously matched entity.
     */
    private void annotateNamedEntities(JCas jcas, List<Node> snippetNodes, String text) {
        // Absolute position in text from which the next entity is searched.
        int offset = 0;
        for (Node snippetNode : snippetNodes) {
            NodeList childNodes = snippetNode.getChildNodes();
            for (int i = 0; i < childNodes.getLength(); i++) {
                Node node = childNodes.item(i);
                // getAttributes() is null for non-element nodes, so require an element.
                if (node.getNodeType() != Node.ELEMENT_NODE || !"ne".equals(node.getNodeName())) {
                    continue;
                }
                String chemical = node.getTextContent();
                if (chemical.isEmpty()) {
                    continue;
                }
                Node chebiIdNode = node.getAttributes().getNamedItem("chebi-id");
                if (chebiIdNode == null) {
                    continue;
                }
                // NOTE(review): the pattern starts with a literal 'W'; presumably it is
                // meant to strip a prefix ending in ':' - verify against the corpus.
                String chebiID = chebiIdNode.getTextContent().replaceAll("W[^:]+:", "");

                int begin = text.indexOf(chemical, offset);
                if (begin < 0) {
                    continue;
                }
                int end = begin + chemical.length();

                NamedEntity namedEntity = new NamedEntity(jcas);
                namedEntity.setBegin(begin);
                namedEntity.setEnd(end);
                namedEntity.setConfidence(1.0);
                namedEntity.setId("," + chebiID);
                namedEntity.setSource(Constants.GOLDSTANDARD);
                namedEntity.addToIndexes();

                // Continue after this match using the absolute end position. The
                // previous code stored an end relative to a substring here, which
                // moved the search cursor to a wrong (often earlier) position.
                offset = end;
            }
        }
    }
}