package edu.stanford.nlp.international.arabic.process; import edu.stanford.nlp.util.logging.Redwood; import java.io.BufferedReader; import java.io.IOException; import java.io.PrintWriter; import java.io.Reader; import java.io.StringReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.objectbank.IteratorFromReaderFactory; import edu.stanford.nlp.objectbank.LineIterator; import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.process.SerializableFunction; import edu.stanford.nlp.sequences.DocumentReaderAndWriter; import edu.stanford.nlp.sequences.SeqClassifierFlags; /** * Reads newline delimited UTF-8 Arabic sentences with or without * gold segmentation markers. When segmentation markers are present, * this class may be used for * * @author Spence Green */ public class ArabicDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> { /** A logger for this class */ private static Redwood.RedwoodChannels log = Redwood.channels(ArabicDocumentReaderAndWriter.class); private static final long serialVersionUID = 3667837672769424178L; private final IteratorFromReaderFactory<List<CoreLabel>> factory; private final TokenizerFactory<CoreLabel> tf; // The segmentation marker used in the ATBv3 training data. private static final Character DEFAULT_SEG_MARKER = '-'; private final Character segMarker; // TODO(spenceg): Make this configurable. private static final String tagDelimiter = "|||"; private static final String rewriteDelimiter = ">>>"; private final boolean inputHasTags; private final boolean inputHasDomainLabels; private final String inputDomain; private final boolean shouldStripRewrites; public static class RewrittenArabicAnnotation implements CoreAnnotation<String> { public Class<String> getType() { return String.class; } } /** * * @param hasSegMarkers if true, input has segmentation markers */ public ArabicDocumentReaderAndWriter(boolean hasSegMarkers) { this(hasSegMarkers, null); } /** * * @param hasSegMarkers if true, input has segmentation markers * @param tokFactory a TokenizerFactory for the input */ public ArabicDocumentReaderAndWriter(boolean hasSegMarkers, TokenizerFactory<CoreLabel> tokFactory) { this(hasSegMarkers, false, tokFactory); } /** * * @param hasSegMarkers if true, input has segmentation markers * @param hasTags if true, input has morphological analyses separated by tagDelimiter. * @param tokFactory a TokenizerFactory for the input */ public ArabicDocumentReaderAndWriter(boolean hasSegMarkers, boolean hasTags, TokenizerFactory<CoreLabel> tokFactory) { this(hasSegMarkers, hasTags, false, "123", tokFactory); } /** * * @param hasSegMarkers if true, input has segmentation markers * @param hasTags if true, input has morphological analyses separated by tagDelimiter. * @param hasDomainLabels if true, input has a whitespace-terminated domain at the beginning * of each line of text * @param tokFactory a TokenizerFactory for the input */ public ArabicDocumentReaderAndWriter(boolean hasSegMarkers, boolean hasTags, boolean hasDomainLabels, String domain, TokenizerFactory<CoreLabel> tokFactory) { this(hasSegMarkers, hasTags, hasDomainLabels, domain, false, tokFactory); } /** * * @param hasSegMarkers if true, input has segmentation markers * @param hasTags if true, input has morphological analyses separated by tagDelimiter. * @param hasDomainLabels if true, input has a whitespace-terminated domain at the beginning * of each line of text * @param stripRewrites if true, erase orthographical rewrites from the gold labels (for * comparison purposes) * @param tokFactory a TokenizerFactory for the input */ public ArabicDocumentReaderAndWriter(boolean hasSegMarkers, boolean hasTags, boolean hasDomainLabels, String domain, boolean stripRewrites, TokenizerFactory<CoreLabel> tokFactory) { tf = tokFactory; inputHasTags = hasTags; inputHasDomainLabels = hasDomainLabels; inputDomain = domain; shouldStripRewrites = stripRewrites; segMarker = hasSegMarkers ? DEFAULT_SEG_MARKER : null; factory = LineIterator.getFactory(new SerializableFunction<String, List<CoreLabel>>() { private static final long serialVersionUID = 5243251505653686497L; public List<CoreLabel> apply(String in) { List<CoreLabel> tokenList; String lineDomain = ""; if (inputHasDomainLabels) { String[] domainAndData = in.split("\\s+", 2); if (domainAndData.length < 2) { log.info("Missing domain label or text: "); log.info(in); } else { lineDomain = domainAndData[0]; in = domainAndData[1]; } } else { lineDomain = inputDomain; } if (inputHasTags) { String[] toks = in.split("\\s+"); List<CoreLabel> input = new ArrayList<>(toks.length); final String tagDelim = Pattern.quote(tagDelimiter); final String rewDelim = Pattern.quote(rewriteDelimiter); for (String wordTag : toks) { String[] wordTagPair = wordTag.split(tagDelim); assert wordTagPair.length == 2; String[] rewritePair = wordTagPair[0].split(rewDelim); assert rewritePair.length == 1 || rewritePair.length == 2; String raw = rewritePair[0]; String rewritten = raw; if (rewritePair.length == 2) rewritten = rewritePair[1]; CoreLabel cl = new CoreLabel(); if (tf != null) { List<CoreLabel> lexListRaw = tf.getTokenizer(new StringReader(raw)).tokenize(); List<CoreLabel> lexListRewritten = tf.getTokenizer(new StringReader(rewritten)).tokenize(); if (lexListRewritten.size() != lexListRaw.size()) { System.err.printf("%s: Different number of tokens in raw and rewritten: %s>>>%s%n", this.getClass().getName(), raw, rewritten); lexListRewritten = lexListRaw; } if (lexListRaw.size() == 0) { continue; } else if (lexListRaw.size() == 1) { raw = lexListRaw.get(0).value(); rewritten = lexListRewritten.get(0).value(); } else if (lexListRaw.size() > 1) { String secondWord = lexListRaw.get(1).value(); if (secondWord.equals(String.valueOf(segMarker))) { // Special case for the null marker in the vocalized section raw = lexListRaw.get(0).value() + segMarker; rewritten = lexListRewritten.get(0).value() + segMarker; } else { System.err.printf("%s: Raw token generates multiple segments: %s%n", this.getClass().getName(), raw); raw = lexListRaw.get(0).value(); rewritten = lexListRewritten.get(0).value(); } } } cl.setValue(raw); cl.setWord(raw); cl.setTag(wordTagPair[1]); cl.set(CoreAnnotations.DomainAnnotation.class, lineDomain); cl.set(RewrittenArabicAnnotation.class, rewritten); input.add(cl); } tokenList = IOBUtils.StringToIOB(input, segMarker, true, shouldStripRewrites); } else if (tf == null) { tokenList = IOBUtils.StringToIOB(in, segMarker); } else { List<CoreLabel> line = tf.getTokenizer(new StringReader(in)).tokenize(); tokenList = IOBUtils.StringToIOB(line, segMarker, false); } if (inputHasDomainLabels && !inputHasTags) IOBUtils.labelDomain(tokenList, lineDomain); else if (!inputHasDomainLabels) IOBUtils.labelDomain(tokenList, inputDomain); return tokenList; } }); } /** * Required, but unused. */ public void init(SeqClassifierFlags flags) {} /** * Iterate over an input document. */ public Iterator<List<CoreLabel>> getIterator(Reader r) { return factory.getIterator(r); } public void printAnswers(List<CoreLabel> doc, PrintWriter pw) { pw.println("Answer\tGoldAnswer\tCharacter"); for(CoreLabel word : doc) { pw.printf("%s\t%s\t%s%n", word.get(CoreAnnotations.AnswerAnnotation.class), word.get(CoreAnnotations.GoldAnswerAnnotation.class), word.get(CoreAnnotations.CharAnnotation.class)); } } /** * For debugging. * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { if (args.length != 1) { System.err.printf("Usage: java %s file > output%n", ArabicDocumentReaderAndWriter.class.getName()); System.exit(-1); } String fileName = args[0]; TokenizerFactory<CoreLabel> tokFactory = ArabicTokenizer.atbFactory(); String atbVocOptions = "removeProMarker,removeMorphMarker"; tokFactory.setOptions(atbVocOptions); BufferedReader reader = IOUtils.readerFromString(fileName); for (String line; (line = reader.readLine()) != null; ) { String[] toks = line.split("\\s+"); final String delim = Pattern.quote(tagDelimiter); boolean isStart = true; for (String wordTag : toks) { String[] wordTagPair = wordTag.split(delim); assert wordTagPair.length == 2; String word = wordTagPair[0]; if (tokFactory != null) { List<CoreLabel> lexList = tokFactory.getTokenizer(new StringReader(word)).tokenize(); if (lexList.size() == 0) { continue; } else if (lexList.size() == 1) { word = lexList.get(0).value(); } else if (lexList.size() > 1) { String secondWord = lexList.get(1).value(); if (secondWord.equals(String.valueOf(DEFAULT_SEG_MARKER))) { // Special case for the null marker in the vocalized section word = lexList.get(0).value() + String.valueOf(DEFAULT_SEG_MARKER); } else { System.err.printf("%s: Raw token generates multiple segments: %s%n", ArabicDocumentReaderAndWriter.class.getName(), word); word = lexList.get(0).value(); } } } if ( ! isStart ) System.out.print(" "); System.out.print(word); isStart = false; } System.out.println(); } // DocumentReaderAndWriter<CoreLabel> docReader = new ArabicDocumentReaderAndWriter(true, // true, // false, // tokFactory); // Iterator<List<CoreLabel>> itr = docReader.getIterator(new InputStreamReader(new FileInputStream(new File(fileName)))); // while(itr.hasNext()) { // List<CoreLabel> line = itr.next(); // System.out.println(Sentence.listToString(line)); // } } }