package arkref.ace;

import java.io.File;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.Root;
import org.simpleframework.xml.Serializer;
import org.simpleframework.xml.Text;
import org.simpleframework.xml.core.Persister;

import arkref.parsestuff.U;

import com.aliasi.util.Strings;

/**
 * A fairly thin wrapper around the APF XML data structures.
 *
 * http://www.ldc.upenn.edu/Catalog/docs/LDC2005T09/README
 *
 * also APF_V4_0_1.DTD though it's not super helpful
 *
 * The XML parser: http://simple.sourceforge.net/download/stream/doc/tutorial/tutorial.php
 */
public class AceDocument {

	/** The parsed APF annotation document. */
	public Document document;

	/** The contents of the companion ".txt" file, as returned by {@code U.readFile}. */
	public String text;

	/** Reverse index from our own mentions to ACE mentions; null until {@link #freezeMyMentions()} runs. */
	private Map<arkref.data.Mention, AceDocument.Mention> myMention2aceMention;

	/**
	 * Builds the lookup table used by {@link #getAceMention(arkref.data.Mention)} from the
	 * {@code myMention} field of every ACE mention in {@link #document}. ACE mentions whose
	 * {@code myMention} is null are left out. Asserts that it is called at most once.
	 */
	public void freezeMyMentions() {
		assert myMention2aceMention==null : "freeze only once!";
		myMention2aceMention = new HashMap<arkref.data.Mention, AceDocument.Mention>();
		for (Mention aceM : document.getMentions()) {
			if (aceM.myMention != null) {
				myMention2aceMention.put(aceM.myMention, aceM);
			}
		}
	}

	/**
	 * Looks up the ACE mention that was linked to the given mention when
	 * {@link #freezeMyMentions()} ran.
	 *
	 * @param myMention the mention to look up
	 * @return the corresponding ACE mention, or null if the table has no entry for it
	 */
	public AceDocument.Mention getAceMention(arkref.data.Mention myMention) {
		// Without this, calling before freezing fails with an unexplained NullPointerException.
		assert myMention2aceMention != null : "freezeMyMentions() must be called before getAceMention()";
		return myMention2aceMention.get(myMention);
	}

	/**
	 * Loads an ACE document from a pair of files sharing a common path prefix:
	 * {@code path + "_APF.XML"} for the annotations and {@code path + ".txt"} for the text.
	 *
	 * @param path the common path prefix of the two files
	 * @return a new AceDocument with {@link #text} and {@link #document} populated
	 * @throws Exception if reading the text or parsing the APF file fails
	 */
	public static AceDocument load(String path) throws Exception {
		String apfPath = path + "_APF.XML";
		String textPath= path + ".txt";
		AceDocument aceDoc = new AceDocument();
		aceDoc.text = U.readFile(textPath);
		aceDoc.document = parseFile(apfPath);
		return aceDoc;
	}

	/**
	 * Parses an APF XML file and sets the {@code entity} back-reference on every mention.
	 *
	 * @param apfXmlFile path of the APF XML file
	 * @return the {@code document} element of the parsed file
	 * @throws Exception if the XML cannot be read into the annotated structures
	 */
	public static Document parseFile(String apfXmlFile) throws Exception {
		Serializer serializer = new Persister();
		File source = new File(apfXmlFile);
		SourceFile sf = serializer.read(SourceFile.class, source);
		for (Entity en : sf.document.entities ) {
			for (Mention m : en.mentions) {
				// Sanity check that the mention ID is prefixed by its entity's ID.
				assert en.ID().replace("E","").equals(m.aceID.replaceFirst("-.*",""));
				m.entity = en;
			}
		}
		return sf.document;
	}

	/**
	 * Prints one line per mention (raw ID, numeric ID, head text, extent text) for each
	 * APF XML file named on the command line.
	 *
	 * @param args paths of APF XML files
	 * @throws Exception if any file fails to parse
	 */
	public static void main(String args[]) throws Exception {
		for (String f : args) {
			Document d = parseFile(f);
			for (Entity en : d.entities ) {
				for (Mention m : en.mentions) {
					U.pl(m.aceID +" | "+m.ID()+" | "+m.head.charseq.text+" | "+m.extent.charseq.text);
				}
			}
		}
	}

	//////////// APF XML structures  ////////////

	/** Root element of an APF file; only its {@code document} child is mapped. */
	@Root(strict=false)
	public static class SourceFile {
		@Element(name="document")
		Document document;
	}

	/** The {@code document} element: a DOCID attribute plus the inline list of entities. */
	@Root(strict=false)
	public static class Document {
		@ElementList(inline=true, entry="entity")
		List <Entity> entities;

		@Attribute(name="DOCID")
		String docid;

		/**
		 * Collects the mentions of all entities into one new list, entity by entity in
		 * list order.
		 *
		 * @return a freshly allocated list; modifying it does not affect the entities
		 */
		public ArrayList<Mention> getMentions() {
			ArrayList <Mention> mentions = new ArrayList<Mention>();
			for (Entity en : entities ) {
				mentions.addAll(en.mentions);
			}
			return mentions;
		}
	}

	/** Three-way comparison of two ints yielding -1, 0 or 1, without boxing. */
	private static int compareInts(int a, int b) {
		return (a < b) ? -1 : ((a == b) ? 0 : 1);
	}

	/**
	 * Sorts the given list in place by ascending start offset of each mention's head.
	 *
	 * @param mentions the list to sort; it is modified
	 */
	public static void mentionsHeadSort(List<Mention> mentions) {
		Collections.sort(mentions, new Comparator<AceDocument.Mention>() {
			public int compare(Mention m1, Mention m2) {
				return compareInts(m1.head.charseq.start, m2.head.charseq.start);
			}
		});
	}

	/**
	 * Sorts the given list in place by ascending start offset of each mention's extent.
	 *
	 * @param mentions the list to sort; it is modified
	 */
	public static void mentionsExtentSort(List<Mention> mentions) {
		Collections.sort(mentions, new Comparator<AceDocument.Mention>() {
			public int compare(Mention m1, Mention m2) {
				return compareInts(m1.extent.charseq.start, m2.extent.charseq.start);
			}
		});
	}

	/** An {@code entity} element: an ID attribute plus its inline list of mentions. */
	@Root(name="entity",strict=false)
	public static class Entity {
		@Attribute(name="ID")
		private String aceID;

		@ElementList(inline=true)
		List <Mention> mentions;

		/**
		 * @return the raw ID with everything up to and including the last "-E" replaced
		 *         by "E"; the raw ID unchanged if it contains no "-E"
		 */
		public String ID() {
			return aceID.replaceFirst(".*-E", "E");
		}

		@Override
		public String toString() {
			return String.format("%-3s", ID());
		}
	}

	/** An {@code entity_mention} element with its extent and head phrases. */
	@Root(name="entity_mention",strict=false)
	public static class Mention {
		@Attribute(name="ID")
		public String aceID;

		@Element
		public Phrase extent;

		@Element
		public Phrase head;

		/** The owning entity; not read from XML, assigned in {@link AceDocument#parseFile(String)}. */
		public Entity entity;

		/** Convenience for later processing: the data.Mention this ACE mention corresponds to. **/
		public arkref.data.Mention myMention = null;

		/**
		 * @return the part of the raw ID after its last "-", parsed as an integer
		 * @throws NumberFormatException if that part is not a valid integer
		 */
		public int ID() {
			return Integer.parseInt(aceID.replaceFirst(".*-",""));
		}

		/** @return true if this is the only mention of its entity */
		public boolean isSingleton() {
			assert entity.mentions.size() != 0;
			return entity.mentions.size() == 1;
		}

		/**
		 * Renders the mention with whitespace-normalized text. When {@code myMention} is
		 * set, its ID is shown together with the extent (and the head, if it differs from
		 * the extent); otherwise the ACE ID and the extent are shown.
		 */
		@Override
		public String toString() {
			if (myMention != null) {
				String ex = Strings.normalizeWhitespace(extent.charseq.text);
				String h = Strings.normalizeWhitespace(head.charseq.text);
				if (ex.equals(h)) {
					return String.format("M%-2d <%s>", myMention.ID(), ex);
				} else {
					return String.format("M%-2d <%s | %s>", myMention.ID(), ex, h);
				}
			} else {
				return String.format("AM%-2d | %s", ID(), Strings.normalizeWhitespace(extent.charseq.text));
			}
			// Alternative, more verbose format kept for reference:
			// return String.format("AM%-3d | %s | %s", ID(),
			// 		Strings.normalizeWhitespace(head.charseq.text), Strings.normalizeWhitespace(extent.charseq.text));
		}
	}

	/** A phrase (extent or head), wrapping a single {@code charseq} element. */
	@Root(strict=false)
	public static class Phrase {
		@Element(name="charseq")
		public Charseq charseq;
	}

	/** A {@code charseq} element: START and END offsets plus the covered text. */
	@Root
	public static class Charseq {
		// These starts and ends are consistent with one another, but it is unclear what
		// they are counted from: e.g. start=0 is at a seemingly arbitrary place in the SGML file.
		@Attribute(name="START")
		public int start;

		@Attribute(name="END")
		public int end;

		@Text
		public String text;
	}
}