package edu.umd.hooka.corpora; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.StringReader; import java.util.Random; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import edu.umd.hooka.alignment.aer.ReferenceAlignment; import edu.umd.hooka.corpora.Chunk; import edu.umd.hooka.corpora.Language; import edu.umd.hooka.corpora.LanguagePair; import edu.umd.hooka.corpora.ParallelChunk; public class ParallelCorpusReader extends DefaultHandler { public interface PChunkCallback { void handlePChunk(ParallelChunk p); } static class ChunkSetCB implements PChunkCallback { ChunkSetCB(ParallelCorpusReader pcr) { pcr_ = pcr; } ParallelCorpusReader pcr_; public void handlePChunk(ParallelChunk p) { pcr_.resultChunk = p; } } private ParallelChunk resultChunk = null; public ParallelCorpusReader() { cb_ = new ChunkSetCB(this); try { sp = SAXParserFactory.newInstance().newSAXParser(); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException("Couldn't build XML parser"); } } PChunkCallback cb_; private ParallelCorpusReader(PChunkCallback cb) { cb_ = cb; try { sp = SAXParserFactory.newInstance().newSAXParser(); } catch (Exception e) { e.printStackTrace(); throw new RuntimeException("Failed " + e); } } SAXParser sp = null; public ParallelChunk parseString(String xml) { resultChunk = null; try { sp.parse(new InputSource(new StringReader(xml)), this); }catch(final SAXException se) { resultChunk = null; se.printStackTrace(); throw new RuntimeException("SaxE: " + se+"\n"+xml); }catch (final IOException ie) { resultChunk = null; ie.printStackTrace(); throw new RuntimeException("ioe: " + ie); } return resultChunk; } public static void parseXMLDocument(String file, PChunkCallback cb) { //get a factory ParallelCorpusReader pcr = new ParallelCorpusReader(cb); final SAXParserFactory spf = SAXParserFactory.newInstance(); try { //get a new instance of parser final SAXParser sp = spf.newSAXParser(); //parse the file and also register this class for call backs sp.parse(file, pcr); }catch(final SAXException se) { se.printStackTrace(); }catch(final ParserConfigurationException pce) { pce.printStackTrace(); }catch (final IOException ie) { ie.printStackTrace(); } } ParallelChunk pchunk = null; //Event Handlers public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws SAXException { //reset if(qName.equalsIgnoreCase("pchunk")) { pchunk = new ParallelChunk(); pchunk.setName(attributes.getValue("name")); } else if (qName.equalsIgnoreCase("s")) { lang = Language.languageForISO639_1(attributes.getValue("lang")); tempVal = new StringBuffer(); } else if (qName.equalsIgnoreCase("wordalignment")) { tempVal = new StringBuffer(); langpair = LanguagePair.languageForISO639_1Pair(attributes.getValue("langpair")); } else if (qName.equalsIgnoreCase("pdoc")) { docName = attributes.getValue("name"); } else { throw new SAXException("Unknown tag: " + qName); } } Language lang; LanguagePair langpair; StringBuffer tempVal; String docName; int pchunkCount = 0; int chunkCount = 0; int refAlignCount = 0; public void characters(char[] ch, int start, int length) throws SAXException { if (tempVal != null) tempVal.append(ch,start,length); } public void endElement(String uri, String localName, String qName) throws SAXException { if(qName.equalsIgnoreCase("pchunk")) { pchunkCount++; cb_.handlePChunk(pchunk); }else if (qName.equalsIgnoreCase("s")) { String s = tempVal.toString().trim(); if (s.length() == 0) { System.err.println(pchunk.getName() + ": Empty segment for lang=" + lang); } else { Chunk c = new Chunk(tempVal.toString().trim()); pchunk.addChunk(lang, c); chunkCount++; tempVal = null; } }else if (qName.equalsIgnoreCase("wordalignment")) { Chunk sc = pchunk.getChunk(langpair.getSource()); if (sc == null) throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getSource() + ". Note: manual word alignment data must follow the chunk data."); Chunk tc = pchunk.getChunk(langpair.getTarget()); if (tc == null) throw new RuntimeException("PChunk doesn't contain data for lang: " + langpair.getTarget() + ". Note: manual word alignment data must follow the chunk data."); ReferenceAlignment r = new ReferenceAlignment( sc.getLength(), tc.getLength()); r.addAlignmentPointsPharaoh(tempVal.toString().trim()); pchunk.addReferenceAlignment(langpair, r); refAlignCount++; tempVal = null; }else if (qName.equalsIgnoreCase("pdoc")) { System.err.println("Finished parsing document " + docName); System.err.println(" pchunks: " + pchunkCount); System.err.println(" chunks: " + chunkCount); System.err.println(" ref alignments: " + refAlignCount); }else { throw new SAXException("Unknown tag: " + qName); } } private static void convertToXMLDocument( String label, String ifile1, String ifile2, String afile1_2, String ofile, String oenc, String le, String lf, boolean readAlignments) { try { if (readAlignments) { if (afile1_2 == null || afile1_2.equals("")) throw new RuntimeException("I'm supposed to read alignments, but no alignment file is set!"); } else if (afile1_2 != null && !afile1_2.equals("")) throw new RuntimeException("I'm not set to read alignments, but an alignment file is set!"); BufferedReader r1 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile1), "UTF8")); BufferedReader r2 = new BufferedReader(new InputStreamReader(new FileInputStream(ifile2), "UTF8")); BufferedReader r1_2 = null; if (readAlignments) r1_2= new BufferedReader(new InputStreamReader(new FileInputStream(afile1_2), "UTF8")); OutputStreamWriter w1 = new OutputStreamWriter(new FileOutputStream(ofile), oenc); Language de = Language.languageForISO639_1(lf); Language en = Language.languageForISO639_1(le); LanguagePair ende = null; if (readAlignments) ende = LanguagePair.languageForISO639_1Pair(le + "-" + lf); System.err.println("Reading " + en + " from: " + ifile1); System.err.println("Reading " + de + " from: " + ifile2); if (readAlignments) System.err.println("Reading alignments (" + ende + ") from: " + afile1_2); BufferedWriter w = new BufferedWriter(w1); w.write("<?xml version=\"1.0\" encoding=\""+ w1.getEncoding() + "\"?>"); w.newLine(); int x = ifile1.lastIndexOf('/'); if (x < 0 || x >= ifile1.length()) x = 0; w.write("<pdoc name=\"" + ifile1.substring(x+1) + "\">"); w.newLine(); String e; int lc = 0; while ((e = r1.readLine()) != null) { lc += 1; String f = r2.readLine(); if (f == null) { System.err.println("WARNING: " + ifile2 + " has fewer lines than " + ifile1); break; } String a = null; if (readAlignments) { a = r1_2.readLine(); if (a==null) System.err.println(afile1_2 + " has fewer lines than corpora files -- dropping alignments for remaining sentences"); } Chunk ec = new Chunk(e); Chunk fc = new Chunk(f); String name = label + lc; ParallelChunk p = new ParallelChunk(); p.setName(name); p.addChunk(de, fc); p.addChunk(en, ec); if (a != null) { ReferenceAlignment ra = new ReferenceAlignment(ec.getLength(), fc.getLength()); try { ra.addAlignmentPointsPharaoh(a); p.addReferenceAlignment(ende, ra); } catch (RuntimeException re) { System.err.println("Couldn't set alignment points for sentence # " + lc); System.err.println(" " + en +": len=" + ec.getLength() + " words=" + ec); System.err.println(" " + de +": len=" + fc.getLength() + " words=" + fc); System.err.println(" " + ende + ": " + a); } } w.write(p.toXML()); } String t = r2.readLine(); if (t != null) System.err.println("WARNING: " + ifile2 + " has more lines than " + ifile1); w.write("</pdoc>"); System.out.println("Converted " + lc + " sentences"); w.newLine(); w.close(); r1.close(); r2.close(); if (readAlignments) r1_2.close(); } catch (Exception e) { e.printStackTrace(); } } public static void main(String[] args) { if (false) try { parseXMLDocument("/Users/redpony/bitexts/hansards.fr-en/hansards.fr-en.xml", new PChunkCallback() { Random r = new Random(1); BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/tmp/bar.xml"), "UTF8")); public void handlePChunk(ParallelChunk p) { Language fr = Language.languageForISO639_1("fr"); Language en = Language.languageForISO639_1("en"); Chunk f = p.getChunk(fr); if (f == null) return; Chunk e = p.getChunk(en); if (e == null) return; float elen = e.getLength(); float flen = f.getLength(); if (elen > 40) return; if (flen > 40) return; float ra = elen / flen; if (ra > 1.3) return; try { if (r.nextDouble() > 0.15) return; br.write(p.toXML()); } catch (Exception e1) { e1.printStackTrace(); } } @Override public void finalize() { try { br.close(); } catch (Exception e){} } }); } catch (Exception e) { e.printStackTrace(); } if (true) convertToXMLDocument( "koen_jhu_", "/Users/redpony/bitexts/kkn-eng-alignments/kkn.utf8", "/Users/redpony/bitexts/kkn-eng-alignments/eng", "/Users/redpony/bitexts/kkn-eng-alignments/align", "/tmp/foo.xml", "utf8", "ko", "en", true); if (false) convertToXMLDocument( "eu+nc_", "/Users/redpony/bitexts/corpus.en", "/Users/redpony/bitexts/corpus.de", "", "/tmp/foo.xml", "utf8", "en", "de", false); } }