package edu.cmu.geolocator.io;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import edu.cmu.geolocator.model.ACE_NETag;
import edu.cmu.geolocator.model.Document;
import edu.cmu.geolocator.model.Paragraph;
import edu.cmu.geolocator.model.Sentence;
import edu.cmu.geolocator.model.TagDocument;
import edu.cmu.geolocator.model.Token;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.util.CoreMap;
/**
 * Reads ACE entity-annotation files ({@code *.apf.xml}) found under a file or directory and
 * collects, per document id, the named ({@code TYPE="NAM"}) entity mentions as
 * {@link ACE_NETag}s inside a {@link TagDocument}.
 *
 * <p>The parser is line-oriented: it relies on each XML element of interest starting on its own
 * line and on the attribute order seen in the files it was written for. It is not a general XML
 * parser.
 *
 * <p>{@link #main(String[])} additionally copies every top-level annotation file that contains at
 * least one facility ({@code TYPE="FAC"}) mention into a separate directory.
 */
public class CountACEImporter {

  /** Source directory used by {@link #main(String[])} when no first argument is supplied. */
  private static final String DEFAULT_ACE_DIR =
      "C:\\chenxu\\cmu\\IEEE paper data\\parallel data\\LDC Data\\ACE 2005 Multilingual LDC2005E18\\ACE2005-TrainingData-V6.0\\English\\train";

  /** Target directory used by {@link #main(String[])} when no second argument is supplied. */
  private static final String DEFAULT_FAC_DIR = "C:\\chenxu\\cmu\\IEEE paper data\\ACEFAC";

  // The entity type is stored as the raw attribute token taken from the <entity ...> line,
  // so comparisons are made against the full TYPE="..." text rather than the bare value.
  private static final String TYPE_GPE = "TYPE=\"GPE\"";
  private static final String TYPE_LOC = "TYPE=\"LOC\"";
  private static final String TYPE_FAC = "TYPE=\"FAC\"";

  /** Parsed annotation documents keyed by document id (the key may be null for odd files). */
  public HashMap<String, TagDocument> tagDoc;

  /**
   * @return the map of parsed annotation documents, keyed by document id
   */
  public HashMap<String, TagDocument> getTagDoc() {
    return tagDoc;
  }

  /**
   * Replaces the map of parsed annotation documents.
   *
   * @param tagDoc the new map, keyed by document id
   */
  public void setTagDoc(HashMap<String, TagDocument> tagDoc) {
    this.tagDoc = tagDoc;
  }

  /**
   * Creates an importer and immediately loads every {@code *.apf.xml} file found at or below
   * {@code filename}.
   *
   * <p>An {@link IOException} during loading is reported on standard error and otherwise ignored,
   * so the importer may end up holding only the documents read before the failure.
   *
   * @param filename path of a single annotation file or of a directory to scan recursively
   */
  public CountACEImporter(String filename) {
    tagDoc = new HashMap<String, TagDocument>();
    try {
      importDocs(new File(filename));
    } catch (IOException e) {
      // Best effort: keep whatever was loaded so far.
      e.printStackTrace();
    }
  }

  /** Placeholder; currently does nothing. */
  void align() {}

  /**
   * Loads the annotations and copies each top-level {@code <docid>.apf.xml} file that contains at
   * least one facility mention into the target directory, printing the name of every copied file.
   *
   * @param argb optional arguments: {@code argb[0]} overrides the source directory and
   *     {@code argb[1]} overrides the target directory; the built-in defaults are used otherwise
   * @throws IOException declared for compatibility with existing callers
   */
  public static void main(String argb[]) throws IOException {
    String sourceDir = (argb != null && argb.length > 0) ? argb[0] : DEFAULT_ACE_DIR;
    String targetDir = (argb != null && argb.length > 1) ? argb[1] : DEFAULT_FAC_DIR;

    CountACEImporter importer = new CountACEImporter(sourceDir);

    // The listing does not depend on the document being processed, so read it once.
    // File.list() returns null when the path is not a readable directory.
    String[] topLevelNames = new File(sourceDir).list();

    // Tallies kept for ad-hoc inspection in a debugger; they are intentionally not printed.
    int goldToponym = 0;
    int facToponym = 0;

    for (Entry<String, TagDocument> e : importer.tagDoc.entrySet()) {
      String docId = e.getKey();
      if (docId == null) {
        continue;
      }

      boolean hasFacility = false;
      for (ACE_NETag tag : e.getValue().getTags()) {
        String type = tag.getCoarseNEType();
        boolean facility = TYPE_FAC.equals(type);
        if (facility || TYPE_GPE.equals(type) || TYPE_LOC.equals(type)) {
          goldToponym++;
        }
        if (facility) {
          facToponym++;
          hasFacility = true;
        }
      }
      if (!hasFacility) {
        continue;
      }

      String facFilename = docId + ".apf" + ".xml";
      // Only files sitting directly in the source directory are copied, even though
      // the import itself descends into sub-directories.
      if (containsName(topLevelNames, facFilename)) {
        System.out.println(facFilename);
        copyFile(
            new File(sourceDir, facFilename).getPath(),
            new File(targetDir, facFilename).getPath());
      }
    }
  }

  /** Returns true when {@code names} is non-null and contains an entry equal to {@code wanted}. */
  private static boolean containsName(String[] names, String wanted) {
    if (names == null) {
      return false;
    }
    for (String name : names) {
      if (wanted.equals(name)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Copies a single file byte-for-byte. Does nothing when the source does not exist.
   *
   * <p>Failures are reported on standard output / standard error and are not propagated. Both
   * streams are always closed, including when the copy fails part-way.
   *
   * @param oldPath path of the file to read
   * @param newPath path of the file to create or overwrite
   */
  public static void copyFile(String oldPath, String newPath) {
    try {
      File source = new File(oldPath);
      if (!source.exists()) { // nothing to copy
        return;
      }
      try (InputStream in = new FileInputStream(source);
          FileOutputStream out = new FileOutputStream(newPath)) {
        byte[] buffer = new byte[8192];
        int read;
        while ((read = in.read(buffer)) != -1) {
          out.write(buffer, 0, read);
        }
      }
    } catch (Exception e) {
      // Deliberately broad: this helper is best-effort and must not abort the caller's loop.
      System.out.println("复制单个文件操作出错");
      e.printStackTrace();
    }
  }

  /**
   * Recursively visits {@code node} and parses every regular file whose absolute path ends in
   * {@code .apf.xml}, storing the result in {@link #tagDoc} under its document id.
   *
   * @param node file or directory to visit
   * @throws IOException if an annotation file cannot be read
   */
  void importDocs(File node) throws IOException {
    if (node.isDirectory()) {
      String[] children = node.list();
      if (children == null) {
        // Directory could not be listed (I/O error or permissions); skip it.
        return;
      }
      for (String child : children) {
        importDocs(new File(node, child));
      }
    } else if (node.isFile() && node.getAbsolutePath().endsWith(".apf.xml")) {
      TagDocument doc = new TagDocument();
      fillACETagDoc(doc, node.getAbsoluteFile());
      tagDoc.put(doc.getDid(), doc);
    }
  }

  /**
   * Parses one annotation file line by line and adds a tag to {@code doc} for the {@code <head>}
   * character sequence of every entity mention whose third space-separated token is
   * {@code TYPE="NAM"}.
   *
   * <p>The entity type and subtype attached to each tag are the raw third and fourth
   * space-separated tokens of the enclosing {@code <entity ...>} line.
   *
   * @param doc document to populate (id and tags)
   * @param absoluteFile annotation file to read
   * @throws IOException if the file cannot be read
   */
  private void fillACETagDoc(TagDocument doc, File absoluteFile) throws IOException {
    try (BufferedReader reader = new BufferedReader(new FileReader(absoluteFile))) {
      String entityType = null;
      String entitySubtype = null;
      boolean inNamedMention = false;
      boolean inHead = false;

      String line;
      while ((line = reader.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("<document ")) {
          // The id is the first double-quoted value on the line.
          doc.setDid(line.split("\"")[1]);
        } else if (line.startsWith("<entity ")) {
          String[] attrs = line.split(" ");
          entityType = attrs[2];
          entitySubtype = attrs[3];
          inNamedMention = false;
        } else if (line.startsWith("</entity>")) {
          // The closing tag carries no attributes, so it is matched without a trailing space.
          entityType = null;
          entitySubtype = null;
        } else if (line.startsWith("<entity_mention ")) {
          if (line.split(" ")[2].equals("TYPE=\"NAM\"")) {
            inNamedMention = true;
          }
        } else if (line.startsWith("</entity_mention>")) {
          inNamedMention = false;
        } else if (line.startsWith("<head>")) {
          inHead = true;
        } else if (line.startsWith("</head>")) {
          inHead = false;
        } else if (line.startsWith("<charseq ") && inHead && inNamedMention) {
          // Expected shape: <charseq START="n" END="m">text</charseq>
          String[] parts = line.split(">");
          String mention = parts[1].split("<")[0];
          String[] attrs = parts[0].split(" ");
          int start = parseQuotedInt(attrs[1]);
          int end = parseQuotedInt(attrs[2]);
          doc.addTag(new ACE_NETag(mention, start, end, entityType, entitySubtype));
        }
      }
    }
  }

  /** Parses the integer out of an attribute token of the form {@code NAME="123"}. */
  private static int parseQuotedInt(String attribute) {
    String value = attribute.split("=\"")[1];
    // Drop the closing quote.
    return Integer.parseInt(value.substring(0, value.length() - 1));
  }

  /**
   * Parses an ACE source (SGML) document line by line, filling in the document id, headline and
   * paragraphs of {@code doc} while keeping a running character offset.
   *
   * <p>The offset counts one character per line break plus the text between tags; tag markup
   * itself is not counted. NOTE(review): this presumably mirrors the offsets used by the
   * annotation files; confirm before relying on it.
   *
   * @param doc document to populate
   * @param file source file to read
   * @throws IOException if the file cannot be read
   */
  private void fillACEDoc(Document doc, File file) throws IOException {
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
      int offset = 0;
      String headline = "";
      StringBuilder paraText = new StringBuilder();
      boolean inHeadline = false;
      boolean inText = false;
      ArrayList<Paragraph> paragraphs = new ArrayList<Paragraph>();
      Paragraph current = null;

      String line;
      while ((line = reader.readLine()) != null) {
        if (line.startsWith("<DOC>") || line.startsWith("</DOC>")) {
          offset++;
        } else if (line.startsWith("<DOCID>")) {
          String id = line.split(">")[1].split("<")[0];
          offset += id.length() + 1;
          doc.setDid(id.trim());
        } else if (line.startsWith("<DOCTYPE")) {
          String type = line.split(">")[1].split("<")[0];
          offset += type.length() + 1;
        } else if (line.startsWith("<DATETIME>")) {
          // 21 = length of "<DATETIME>" plus "</DATETIME>".
          offset += line.length() - 21 + 1;
        } else if (line.startsWith("<BODY>") || line.startsWith("</BODY>")) {
          offset++;
        } else if (line.startsWith("<HEADLINE>")) {
          offset++;
          inHeadline = true;
        } else if (line.startsWith("</HEADLINE>")) {
          offset++;
          inHeadline = false;
          doc.setHeadline(headline);
        } else if (line.startsWith("<TEXT>")) {
          current = new Paragraph();
          current.setParaStart(offset);
          offset++;
          inText = true;
          paraText.append("\n");
        } else if (line.startsWith("</TEXT>")) {
          offset++;
          inText = false;
          // A stray closing tag without a matching <TEXT> has no paragraph to finish.
          if (current != null) {
            current.setParagraphString(paraText.toString());
            paragraphs.add(current);
          }
          paraText = new StringBuilder();
        } else if (line.startsWith("<TURN>")) {
          offset++;
          paraText.append("\n");
        } else if (line.startsWith("</TURN>")) {
          offset++;
          paraText.append("\n");
        } else if (line.startsWith("<SPEAKER>")) {
          // 19 = length of "<SPEAKER>" plus "</SPEAKER>".
          offset += line.length() - 19 + 1;
          paraText.append(line.split(">")[1].split("<")[0]).append("\n");
        } else if (inText) {
          if (paraText.length() == 0) {
            paraText.append(line);
          } else {
            paraText.append(" ").append(line);
          }
          // Accumulate like every other branch; assigning here would discard the
          // offset built up so far.
          offset += line.length() + 1;
        } else if (inHeadline) {
          doc.setHeadlineStart(offset);
          offset += line.length() + 1;
          headline = line;
        } else {
          offset++;
        }
      }
      doc.setP(paragraphs);
    }
  }
}