import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.util.*;
import edu.cmu.minorthird.util.gui.*;
import com.wcohen.ss.api.*;
import com.wcohen.ss.*;
import java.io.*;
import java.util.*;
/** Takes as input two directories, one containing documents with
* XML-based markup, and one with similar plain-text documents, and
* tries to import the XML markup to the plain-text documents.
*
* Specifically, this will try to align each plain-text document
* from dirWithPlainText with a similarly-named marked-up document
* from the dirWithMarkup directory. ("Similarly-named" means that the
* .plainTextExtension, by default ".txt", is replaced with the
* markupExtension, by default ".xml"). Alignment is done with a fast
* (linear time and space) variant of Needleman-Wunsch edit distance.
* If an alignment is found, then each marked-span in the marked-up
* document (i.e., each span that has been assigned a span type) will
* be aligned to the corresponding span in the plain-text document,
* and the span type will be given to the corresponding span. The
* 'imported' labels for the documents in the dirWithPlainText
* directory will be written out to 'labelFile'.
*/
public class AlignMarkup
{
    /** If true, suppress the 'adjustment' phase that nudges span boundaries after alignment. */
    static private final boolean NO_ADJUSTMENT=false;

    /** If true, every imported span is re-checked token-by-token and mismatches are counted as errors. */
    static private boolean checkAlignments = true;

    /**
     * Command-line entry point.
     *
     * @param args dirWithMarkup, dirWithPlainText, labelFile, and optionally the markup
     *   extension (default ".xml") followed by the plain-text extension (default ".txt")
     * @throws Exception if loading either directory or saving the label file fails
     */
    static public void main(String[] args) throws Exception
    {
        if (args.length<3) {
            // The optional extensions are listed in the order in which they are parsed below.
            System.out.println("usage: dirWithMarkup dirWithPlainText labelFile [.markupExtension .plainTextExtension]");
            System.exit(-1);
        }
        String dirWithMarkup = args[0];
        String dirWithPlainText = args[1];
        String labelFileName = args[2];
        String markupExtension = args.length>3 ? args[3] : ".xml";
        String plainTextExtension = args.length>4 ? args[4] : ".txt";

        TextBaseLoader markupLoader = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE,TextBaseLoader.USE_XML);
        TextBase markupBase = markupLoader.load(new File(dirWithMarkup));
        TextLabels markupLabels = markupLoader.getLabels();

        // NOTE(review): the plain-text directory is loaded with the same USE_XML flag as the
        // marked-up directory - confirm this is intended for documents without markup.
        TextBaseLoader plainTextLoader = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE,TextBaseLoader.USE_XML);
        TextBase plainTextBase = plainTextLoader.load(new File(dirWithPlainText));
        MutableTextLabels plainTextLabels = new BasicTextLabels(plainTextBase);

        alignMarkup(markupLabels,plainTextLabels,markupExtension,plainTextExtension);
        new TextLabelsLoader().saveTypesAsOps(plainTextLabels,new File(labelFileName));
    }

    /**
     * Derive the id of the marked-up document that corresponds to a plain-text document
     * by dropping the last plainTextExtension.length() characters of the id and appending
     * markupExtension. An id shorter than the plain-text extension is dropped entirely
     * rather than causing a StringIndexOutOfBoundsException.
     */
    static private String markupDocIdFor(String plainDocId,String plainTextExtension,String markupExtension)
    {
        int stemLength = Math.max( 0, plainDocId.length()-plainTextExtension.length() );
        return plainDocId.substring(0,stemLength) + markupExtension;
    }

    /**
     * For every document in plainTextLabels' text base, find the correspondingly named
     * document in markupLabels' text base, align the two character sequences, and copy
     * each non-empty typed span of the marked-up document onto the aligned region of the
     * plain-text document. Progress and summary statistics are printed to System.out.
     *
     * @param markupLabels labels (and text base) of the marked-up documents
     * @param plainTextLabels labels of the plain-text documents; modified in place
     * @param markupExtension extension appended to form the marked-up document id
     * @param plainTextExtension extension stripped from the plain-text document id
     */
    static private void alignMarkup(TextLabels markupLabels,MutableTextLabels plainTextLabels,
                                    String markupExtension,String plainTextExtension)
    {
        ApproxNeedlemanWunsch aligner = new ApproxNeedlemanWunsch(CharMatchScore.DIST_01, 1.0);
        ApproxNeedlemanWunsch errorCounter = new ApproxNeedlemanWunsch(CharMatchScore.DIST_01, 1.0);
        aligner.setWidth(200);

        double totErrors = 0;
        double totErrorDistance = 0;
        double totDistance = 0;
        double totAlignments = 0;
        double totAdjustments = 0;

        for (Span.Looper i=plainTextLabels.getTextBase().documentSpanIterator(); i.hasNext(); ) {
            Span plainDocSpan = i.nextSpan();
            String plainString = plainDocSpan.getDocumentContents();
            String plainDocId = plainDocSpan.getDocumentId();
            String markupDocId = markupDocIdFor(plainDocId,plainTextExtension,markupExtension);
            System.out.print("aligning "+plainDocId+" to "+markupDocId);

            Document markupDoc = markupLabels.getTextBase().getDocument(markupDocId);
            if (markupDoc==null) {
                // a missing counterpart is not fatal: report it and move on to the next document
                System.out.println("WARNING: can't find marked version of "+plainDocId);
                continue;
            }
            String markupString = markupDoc.getText();
            System.out.print(" string lengths: "+plainString.length()+","+markupString.length());

            // scoring also leaves the alignment inside 'aligner', which getAlignedChar queries below
            long t0 = System.currentTimeMillis();
            double score = aligner.score(markupString,plainString);
            long tf = System.currentTimeMillis();
            System.out.println(" score = "+score+" runtime = "+((tf-t0)/1000.0)+" sec");

            for (Iterator j=markupLabels.getTypes().iterator(); j.hasNext(); ) {
                String type = (String)j.next();
                for (Span.Looper k=markupLabels.instanceIterator(type,markupDocId); k.hasNext(); ) {
                    Span markupSpan = k.nextSpan();
                    if (markupSpan.size()<=0) continue; // nothing to import for an empty span

                    int lo = markupSpan.getLoChar();
                    int hi = markupSpan.getHiChar();
                    // align first char of span to plaintext
                    int lo1 = aligner.getAlignedChar(lo,false);
                    // align last char of span to plaintext, add one for the limit
                    int hi1 = aligner.getAlignedChar(hi-1,true)+1;

                    boolean alignFailed =
                        lo1<0 || hi1<0 || lo1>plainString.length() || hi1>plainString.length() || lo1>=hi1;
                    if (alignFailed) {
                        // the whole markup span counts as erroneous; hi1-lo1 is meaningless here,
                        // so only the markup span length enters the totals (and only once)
                        totErrors++;
                        totErrorDistance += hi-lo;
                        totDistance += hi-lo;
                        continue;
                    }

                    Span plainSpan = plainDocSpan.charIndexSubSpan(lo1,hi1);
                    Alignment alignment = new Alignment(plainSpan,markupSpan,plainTextLabels,markupLabels);
                    totAdjustments += alignment.adjust();
                    alignment.commit(type);
                    totAlignments++;
                    totDistance += Math.max( hi-lo, hi1-lo1 );

                    // a check on quality
                    if (checkAlignments && !alignment.match()) {
                        totErrors++;
                        double errorDistance =
                            -errorCounter.score( markupString.substring(lo,hi), plainString.substring(lo1,hi1) );
                        if (errorDistance>Math.max( hi-lo, hi1-lo1 )) {
                            // a distance larger than either string is not credible, so it is not accumulated
                            System.out.println("WARNING: infinite error distance for possible mis-alignment?");
                        } else {
                            totErrorDistance += errorDistance;
                        }
                    }
                } // for span k of type j
                plainTextLabels.closeTypeInside(type,plainDocSpan);
            } // for type j
        } // for document i

        if (totAlignments>0) System.out.println("adjustments: "+totAdjustments+"/"+totAlignments+" = "+(totAdjustments/totAlignments));
        if (totAlignments>0) System.out.println("alignment errors: "+totErrors+"/"+totAlignments+" = "+(totErrors/totAlignments));
        if (totDistance>0) System.out.println("Error distance: "+totErrorDistance +"/"+ totDistance + " = "+(totErrorDistance/totDistance));
    }

    /**
     * A proposed correspondence between one span of a marked-up document and one span
     * of the matching plain-text document. The plain-text side may be moved by
     * {@link #adjust()} before it is recorded with {@link #commit(String)}.
     */
    static private class Alignment
    {
        // ranges (in tokens, inclusive) searched by adjust() around the start index and the length
        private static final int LO_DELTA1 = -3, LO_DELTA2 = +3;
        private static final int LEN_DELTA1 = -3, LEN_DELTA2 = +3;

        Span plainSpan;
        final Span markupSpan;
        final MutableTextLabels plainLabels;
        final TextLabels markupLabels;

        // whether plainSpan matched markupSpan token-by-token the last time this was
        // decided by adjust() or match(); null until one of them has decided
        private Boolean priorResult = null;

        // tokens of markupSpan, computed on first use; markupSpan never changes
        private String[] markupTokens = null;

        public Alignment(Span plainSpan,Span markupSpan,MutableTextLabels plainLabels,TextLabels markupLabels)
        {
            this.plainSpan=plainSpan; this.markupSpan=markupSpan;
            this.plainLabels=plainLabels; this.markupLabels=markupLabels;
        }

        /** Change the plainTextLabels by adding the plainSpan to the type */
        public void commit(String type)
        {
            plainLabels.addToType(plainSpan,type);
        }

        /** Try and improve the local alignment by moving the
         * boundaries of the plainText span by a token or so in either
         * direction. Candidates start between LO_DELTA1 and LO_DELTA2
         * tokens from the current start and differ in length by
         * LEN_DELTA1 to LEN_DELTA2 tokens; the first candidate that
         * matches the markup span token-by-token replaces plainSpan.
         * Return 1 or 0, indicating if an adjustment was made. */
        public int adjust()
        {
            if (NO_ADJUSTMENT) return 0;
            if (markupSpanMatch(plainSpan)) {
                priorResult = Boolean.TRUE;
                return 0; // none necessary
            }
            Span docSpan = plainSpan.documentSpan();
            int baseLo = plainSpan.documentSpanStartIndex();
            int baseLen = plainSpan.size();
            for (int lo=baseLo+LO_DELTA1; lo<=baseLo+LO_DELTA2; lo++) {
                for (int len=baseLen+LEN_DELTA1; len<=baseLen+LEN_DELTA2; len++) {
                    // candidate must lie inside the document (index 0 is a valid start) and
                    // be non-empty: short spans would otherwise yield zero or negative lengths
                    if (lo>=0 && len>0 && lo+len<=docSpan.size()) {
                        Span adjustedPlainSpan = docSpan.subSpan( lo, len );
                        if (markupSpanMatch(adjustedPlainSpan)) {
                            plainSpan = adjustedPlainSpan;
                            priorResult = Boolean.TRUE;
                            return 1;
                        }
                    }
                }
            }
            priorResult = Boolean.FALSE; // no adjustment worked
            return 0;
        }

        /** True if the plainText/markupText spans match token-by-token, or if
         * the mismatch has a likely benign explanation (see mungedTokens). */
        public boolean match()
        {
            if (priorResult==null) priorResult=Boolean.valueOf(markupSpanMatch(plainSpan));
            return priorResult.booleanValue() || mungedTokens();
        }

        // a likely explanation for apparent mis-alignments
        private boolean mungedTokens()
        {
            if (plainSpan.asString().indexOf("'t")>=0) return true;
            String markupText = markupSpan.asString();
            return markupText.indexOf("-LBR-")>=0
                || markupText.indexOf("-RBR-")>=0
                || markupText.indexOf("--")>=0;
        }

        /** True if the given span and the markup span yield the same token
         * strings, in the same order, when both are split with the tokenizer
         * of the marked-up text base. Has no side effects on the alignment. */
        private boolean markupSpanMatch(Span span)
        {
            if (markupTokens==null) {
                markupTokens = markupLabels.getTextBase().getTokenizer().splitIntoTokens(markupSpan.asString());
            }
            // both sides deliberately use the same (markup) tokenizer so the tokens are comparable
            String[] plainToks = markupLabels.getTextBase().getTokenizer().splitIntoTokens(span.asString());
            if (markupTokens.length!=plainToks.length) return false;
            for (int m=0; m<markupTokens.length; m++) {
                if (!markupTokens[m].equals(plainToks[m])) return false;
            }
            return true;
        }
    }
}