package edu.cmu.geolocator.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import edu.cmu.geolocator.model.ACE_NETag;
import edu.cmu.geolocator.model.Document;
import edu.cmu.geolocator.model.Paragraph;
import edu.cmu.geolocator.model.Sentence;
import edu.cmu.geolocator.model.TagDocument;
import edu.cmu.geolocator.model.Token;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.util.CoreMap;
/**
 * Imports an ACE-style corpus directory and aligns its annotations with
 * tokenized text.
 *
 * <p>
 * Files ending in {@code .sgm} are read into {@link Document} objects and
 * files ending in {@code .apf.xml} into {@link TagDocument} objects; both are
 * keyed by the document id found inside the file. After import, the first
 * paragraph of every source document is split into sentences and tokens, and
 * every token that falls inside an annotated named mention gets that
 * mention's coarse entity type as its NE label.
 *
 * <p>
 * NOTE(review): files are read with {@link FileReader}, i.e. with the platform
 * default charset. If the corpus encoding differs from the platform default
 * the computed offsets will not line up with the annotation offsets — confirm
 * the expected encoding.
 */
public class ACEImporter {

    /** Corpus directory used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_CORPUS_DIR =
            "E:\\chenxu\\cmu\\IEEE paper data\\machine translation data\\Chinese\\bn\\adj";

    /** Number of markup characters on a {@code <DATETIME>...</DATETIME>} line (21). */
    private static final int DATETIME_MARKUP_LENGTH =
            "<DATETIME>".length() + "</DATETIME>".length();

    /** Number of markup characters on a {@code <SPEAKER>...</SPEAKER>} line (19). */
    private static final int SPEAKER_MARKUP_LENGTH =
            "<SPEAKER>".length() + "</SPEAKER>".length();

    /** Source documents read from {@code .sgm} files, keyed by document id. */
    public HashMap<String, Document> sDocs;

    /** Annotation documents read from {@code .apf.xml} files, keyed by document id. */
    public HashMap<String, TagDocument> tagDoc;

    /** @return the source documents, keyed by document id */
    public HashMap<String, Document> getsDocs() {
        return sDocs;
    }

    /** @param sDocs the source documents, keyed by document id */
    public void setsDocs(HashMap<String, Document> sDocs) {
        this.sDocs = sDocs;
    }

    /** @return the annotation documents, keyed by document id */
    public HashMap<String, TagDocument> getTagDoc() {
        return tagDoc;
    }

    /** @param tagDoc the annotation documents, keyed by document id */
    public void setTagDoc(HashMap<String, TagDocument> tagDoc) {
        this.tagDoc = tagDoc;
    }

    /**
     * Imports every {@code .sgm} / {@code .apf.xml} file found under
     * {@code filename} (a file or a directory, searched recursively) and then
     * aligns the annotations with the tokenized text.
     *
     * @param filename path of the corpus file or directory
     */
    public ACEImporter(String filename) {
        sDocs = new HashMap<String, Document>();
        tagDoc = new HashMap<String, TagDocument>();
        try {
            importDocs(new File(filename));
        } catch (IOException e) {
            // Best effort: the constructor does not declare a checked
            // exception, so the failure is reported and whatever was imported
            // before the error is still aligned below.
            e.printStackTrace();
        }
        align();
    }

    /**
     * Tokenizes the first paragraph of every source document and overwrites
     * the NE label of each token covered by an annotated mention with the
     * mention's coarse entity type.
     *
     * <p>
     * Only the first paragraph of each document is processed. Documents
     * without any paragraph are skipped, and documents without a matching
     * annotation document are tokenized but not re-labelled.
     */
    void align() {
        for (Document sgmDoc : sDocs.values()) {
            if (sgmDoc == null) {
                continue;
            }
            ArrayList<Paragraph> paras = sgmDoc.getP();
            if (paras == null || paras.isEmpty()) {
                // No <TEXT> section was found for this document.
                continue;
            }
            Paragraph firstPara = paras.get(0);

            // The annotation lookup does not depend on the sentence, so it is
            // done once per document. It is null when no .apf.xml file with
            // the same document id was imported.
            TagDocument annotations = tagDoc.get(sgmDoc.getDid());
            ArrayList<ACE_NETag> aceTags =
                    annotations == null ? null : annotations.getTags();

            ArrayList<Sentence> sents = new ArrayList<Sentence>();
            PipeLineAnnotate pla = new PipeLineAnnotate(firstPara.getParagraphString());
            List<CoreMap> nlpSentences = pla.getSentences();
            for (CoreMap nlpSentence : nlpSentences) {
                Sentence sentence = new Sentence(nlpSentence.get(TextAnnotation.class));
                sents.add(sentence);

                Token[] tokens = toTokens(nlpSentence.get(TokensAnnotation.class),
                        sgmDoc.getDid(), firstPara.getParaStart());
                sentence.setTokens(tokens);
                if (tokens.length == 0) {
                    // Nothing to derive sentence bounds from, nothing to tag.
                    continue;
                }
                sentence.setStart(tokens[0].getStart());
                sentence.setEnd(tokens[tokens.length - 1].getEnd());

                if (aceTags != null) {
                    applyAceTags(sentence, aceTags);
                }
            }
            firstPara.setSentences(sents);
        }
    }

    /**
     * Converts the CoreNLP tokens of one sentence into project tokens.
     *
     * @param nlpTokens CoreNLP tokens of the sentence
     * @param docId     id of the owning document
     * @param paraStart start offset of the paragraph the sentence belongs to
     * @return one project token per CoreNLP token, in the same order
     */
    private Token[] toTokens(List<CoreLabel> nlpTokens, String docId, int paraStart) {
        Token[] result = new Token[nlpTokens.size()];
        int j = 0;
        for (CoreLabel nlpToken : nlpTokens) {
            String word = nlpToken.get(TextAnnotation.class);
            // NOTE(review): the id is the document id followed by the literal
            // letter "j", not the token index; kept as in the original —
            // confirm whether the index was intended.
            Token token = new Token(word, docId + "j", j);
            token.setLemma(nlpToken.lemma());
            token.setNE(nlpToken.get(NamedEntityTagAnnotation.class));
            // Shift paragraph-relative offsets by the paragraph start. The -1
            // presumably compensates for the newline prepended to the
            // paragraph string at <TEXT> — TODO confirm.
            token.setStart(nlpToken.beginPosition() + paraStart - 1);
            token.setEnd(nlpToken.endPosition() + paraStart - 1);
            result[j] = token;
            j++;
        }
        return result;
    }

    /**
     * Sets the coarse entity type of every annotation that lies inside the
     * sentence on the tokens that lie inside that annotation, and prints each
     * re-labelled token with its new label.
     *
     * @param sentence sentence whose tokens, start and end are already set
     * @param aceTags  annotations of the document the sentence belongs to
     */
    private void applyAceTags(Sentence sentence, List<ACE_NETag> aceTags) {
        for (ACE_NETag aceTag : aceTags) {
            boolean insideSentence = aceTag.getStart() >= sentence.getStart()
                    && aceTag.getEnd() <= sentence.getEnd();
            if (!insideSentence) {
                continue;
            }
            Token[] candidates = sentence.getTokens();
            for (int k = 0; k < candidates.length; k++) {
                // The "+ 1" allows the token end to be one past the
                // annotation end.
                if (aceTag.getStart() <= candidates[k].getStart()
                        && candidates[k].getEnd() <= aceTag.getEnd() + 1) {
                    candidates[k].setNE(aceTag.getCoarseNEType());
                    System.out.println(candidates[k].getToken()
                            + " " + candidates[k].getNE());
                }
            }
        }
    }

    /**
     * Runs the importer on a corpus directory. Constructing the importer
     * already performs the import and the alignment, which prints every
     * re-labelled token.
     *
     * @param argb optional; {@code argb[0]} is the corpus directory. When
     *             absent, the built-in default directory is used.
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String argb[]) throws IOException {
        String corpusDir = (argb != null && argb.length > 0)
                ? argb[0]
                : DEFAULT_CORPUS_DIR;
        new ACEImporter(corpusDir);
    }

    /**
     * Recursively imports {@code node}: directories are descended into,
     * {@code .sgm} files are added to {@link #sDocs} and {@code .apf.xml}
     * files to {@link #tagDoc}. Other files are ignored.
     *
     * @param node file or directory to import
     * @throws IOException if a directory cannot be listed or a file cannot be
     *                     read
     */
    void importDocs(File node) throws IOException {
        if (node.isDirectory()) {
            String[] children = node.list();
            if (children == null) {
                // File.list() returns null when the directory cannot be read.
                throw new IOException("Cannot list directory: " + node.getAbsolutePath());
            }
            for (String child : children) {
                importDocs(new File(node, child));
            }
        } else if (node.isFile()) {
            String path = node.getAbsolutePath();
            if (path.endsWith(".sgm")) {
                Document doc = new Document();
                fillACEDoc(doc, node.getAbsoluteFile());
                sDocs.put(doc.getDid(), doc);
            } else if (path.endsWith(".apf.xml")) {
                TagDocument doc = new TagDocument();
                fillACETagDoc(doc, node.getAbsoluteFile());
                tagDoc.put(doc.getDid(), doc);
            }
        }
    }

    /**
     * Reads an {@code .apf.xml} annotation file line by line and adds one tag
     * to {@code doc} for every {@code <charseq>} line that appears inside the
     * {@code <head>} of an {@code <entity_mention>} whose third
     * space-separated field is {@code TYPE="NAM"}.
     *
     * <p>
     * The parsing is line based and relies on each element starting on its own
     * line with attributes separated by single spaces.
     *
     * @param doc          annotation document to fill
     * @param absoluteFile annotation file to read
     * @throws IOException if the file cannot be read
     */
    private void fillACETagDoc(TagDocument doc, File absoluteFile)
            throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(absoluteFile));
        try {
            String line;
            // Raw attribute fields (e.g. TYPE="...") of the enclosing entity.
            String etype = null, esubtype = null;
            boolean b_mention = false, b_head = false;
            while ((line = br.readLine()) != null) {
                line = line.trim();
                if (line.startsWith("<document ")) {
                    doc.setDid(line.split("\"")[1]);
                } else if (line.startsWith("<entity ")) {
                    String[] fields = line.split(" ");
                    etype = fields.length > 2 ? fields[2] : null;
                    esubtype = fields.length > 3 ? fields[3] : null;
                    b_mention = false;
                } else if (line.startsWith("</entity>")) {
                    // The closing tag has no attributes, so it is matched with
                    // ">" rather than a trailing space.
                    etype = null;
                    esubtype = null;
                } else if (line.startsWith("<entity_mention ")) {
                    String[] fields = line.split(" ");
                    if (fields.length > 2 && fields[2].equals("TYPE=\"NAM\"")) {
                        b_mention = true;
                    }
                } else if (line.startsWith("</entity_mention>")) {
                    b_mention = false;
                } else if (line.startsWith("<head>")) {
                    b_head = true;
                } else if (line.startsWith("</head>")) {
                    b_head = false;
                } else if (line.startsWith("<charseq ") && b_head && b_mention) {
                    String[] parts = line.split(">");
                    // parts[1] is missing when the mention text starts on the
                    // next line; only text on the <charseq> line is kept.
                    String mention = parts.length > 1 ? parts[1].split("<")[0] : "";
                    String[] attrs = parts[0].split(" ");
                    int start = Integer.parseInt(quotedValue(attrs[1]));
                    int end = Integer.parseInt(quotedValue(attrs[2]));
                    doc.addTag(new ACE_NETag(mention, start, end, etype, esubtype));
                }
            }
        } finally {
            br.close();
        }
    }

    /**
     * Extracts the value from an attribute written as {@code NAME="value"}.
     *
     * @param attribute attribute text ending with the closing quote
     * @return the text between the quotes
     */
    private static String quotedValue(String attribute) {
        String valueAndQuote = attribute.split("=\"")[1];
        return valueAndQuote.substring(0, valueAndQuote.length() - 1);
    }

    /**
     * Reads an {@code .sgm} source file line by line into {@code doc}: sets the
     * document id, the headline and its start, and one paragraph per
     * {@code <TEXT>} section.
     *
     * <p>
     * While reading, a running count is kept of the characters that remain
     * once the markup is removed (one extra per line for the line break). The
     * value of that count at {@code <TEXT>} becomes the paragraph start and
     * its value at the headline line becomes the headline start.
     *
     * @param doc  source document to fill
     * @param file source file to read
     * @throws IOException if the file cannot be read
     */
    private void fillACEDoc(Document doc, File file) throws IOException {
        BufferedReader br = new BufferedReader(new FileReader(file));
        try {
            String line;
            int lcount = 0;
            String headline = "";
            StringBuilder paraString = new StringBuilder();
            boolean b_hline = false, b_content = false;
            ArrayList<Paragraph> paras = new ArrayList<Paragraph>();
            Paragraph p = null;
            while ((line = br.readLine()) != null) {
                if (line.startsWith("<DOC>") || line.startsWith("</DOC>")) {
                    lcount++;
                } else if (line.startsWith("<DOCID>")) {
                    String id = line.split(">")[1].split("<")[0];
                    lcount += id.length() + 1;
                    doc.setDid(id.trim());
                } else if (line.startsWith("<DOCTYPE")) {
                    String type = line.split(">")[1].split("<")[0];
                    lcount += type.length() + 1;
                } else if (line.startsWith("<DATETIME>")) {
                    lcount += line.length() - DATETIME_MARKUP_LENGTH + 1;
                } else if (line.startsWith("<BODY>") || line.startsWith("</BODY>")) {
                    lcount++;
                } else if (line.startsWith("<HEADLINE>")) {
                    lcount++;
                    b_hline = true;
                } else if (line.startsWith("</HEADLINE>")) {
                    lcount++;
                    b_hline = false;
                    doc.setHeadline(headline);
                } else if (line.startsWith("<TEXT>")) {
                    p = new Paragraph();
                    p.setParaStart(lcount);
                    lcount++;
                    b_content = true;
                    paraString.append("\n");
                } else if (line.startsWith("</TEXT>")) {
                    lcount++;
                    b_content = false;
                    // Guard against a closing tag without an opening <TEXT>.
                    if (p != null) {
                        p.setParagraphString(paraString.toString());
                        paras.add(p);
                    }
                    paraString = new StringBuilder();
                } else if (line.startsWith("<TURN>")) {
                    lcount++;
                    paraString.append("\n");
                } else if (line.startsWith("</TURN>")) {
                    lcount++;
                    paraString.append("\n");
                } else if (line.startsWith("<SPEAKER>")) {
                    lcount += line.length() - SPEAKER_MARKUP_LENGTH + 1;
                    paraString.append(line.split(">")[1].split("<")[0])
                            .append("\n");
                } else if (b_content) {
                    if (paraString.length() == 0) {
                        paraString.append(line);
                    } else {
                        paraString.append(" ").append(line);
                    }
                    // Accumulate like every other branch; assigning here would
                    // discard the count built up so far and give any later
                    // paragraph or headline a wrong start.
                    lcount += line.length() + 1;
                } else if (b_hline) {
                    doc.setHeadlineStart(lcount);
                    lcount += line.length() + 1;
                    headline = line;
                } else {
                    lcount++;
                }
            }
            doc.setP(paras);
        } finally {
            br.close();
        }
    }
}