/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.formats.ad;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.cogroo.util.StringsUtil;

import opennlp.tools.formats.ad.ADSentenceStream;
import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;

/**
 * Converts the sentences produced by an {@link ADSentenceStream} into
 * {@link POSSample}s, optionally expanding multiword expressions, appending
 * morphological features to the tags and attaching additional context
 * (contraction and proper-name markers).
 * <p>
 * <b>Note:</b> Do not use this class, internal use only!
 */
public class ADExPOSSampleStream implements ObjectStream<POSSample> {

  /**
   * Recognises the three hyphenated shapes that are split into separate
   * tokens: {@code word-}, {@code -word...} and {@code word-word...}.
   */
  private static final Pattern HYPHEN_PATTERN = Pattern
      .compile("((\\p{L}+)-$)|(^-(\\p{L}+)(.*))|((\\p{L}+)-(\\p{L}+)(.*))");

  private static final Pattern GENDER_M = Pattern.compile(".*\\bM\\b.*");
  private static final Pattern GENDER_F = Pattern.compile(".*\\bF\\b.*");
  private static final Pattern GENDER_N = Pattern.compile(".*\\bM/F\\b.*");

  /** Opening guillemet, written as an escape so the source is encoding-safe. */
  private static final String OPEN_GUILLEMET = "\u00AB";

  /** Closing guillemet, written as an escape so the source is encoding-safe. */
  private static final String CLOSE_GUILLEMET = "\u00BB";

  private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
  private final boolean expandME;
  private final boolean isIncludeFeatures;
  private final boolean additionalContext;

  /**
   * Number of {@link #read()} calls since construction or the last
   * {@link #reset()}. Its parity decides whether guillemets are kept or
   * replaced by a plain double quote, so that only part of the sentences get
   * the alternative quote representation.
   */
  private int callsCount = 0;

  /**
   * Creates a new {@link POSSample} stream from a line stream, i.e.
   * {@link ObjectStream}&lt;{@link String}&gt;, that could be a
   * {@link PlainTextByLineStream} object.
   *
   * @param lineStream
   *          a stream of lines as {@link String}
   * @param expandME
   *          if true will expand the multiword expressions, each word of the
   *          expression will have the POS Tag that was attributed to the
   *          expression plus the prefix B- or I- (CONLL convention)
   * @param includeFeatures
   *          if true will combine the POS Tag with the feature tags
   * @param additionalContext
   *          if true the returned samples carry a two-row additional context:
   *          row 0 holds the contraction marker of each token ("B", "E" or
   *          null) and row 1 holds the proper-name marker ("P" or null)
   */
  public ADExPOSSampleStream(ObjectStream<String> lineStream, boolean expandME,
      boolean includeFeatures, boolean additionalContext) {
    this.adSentenceStream = new ADSentenceStream(lineStream);
    this.expandME = expandME;
    this.isIncludeFeatures = includeFeatures;
    this.additionalContext = additionalContext;
  }

  /**
   * Reads the next sentence from the underlying stream and converts it to a
   * {@link POSSample}.
   *
   * @return the next sample, or {@code null} when the underlying stream is
   *         exhausted
   * @throws IOException
   *           if the underlying stream fails
   * @throws IllegalArgumentException
   *           if the number of tokens differs from the number of additional
   *           context entries collected for the sentence
   */
  public POSSample read() throws IOException {
    // Counted before reading, so the final (null) read is counted as well.
    callsCount++;

    Sentence paragraph = this.adSentenceStream.read();
    if (paragraph == null) {
      return null;
    }

    List<String> sentence = new ArrayList<String>();
    List<String> tags = new ArrayList<String>();
    List<String> contractions = new ArrayList<String>();
    List<String> prop = new ArrayList<String>();
    process(paragraph.getRoot(), sentence, tags, contractions, prop);

    int size = sentence.size();
    if (size != contractions.size() || size != prop.size()) {
      throw new IllegalArgumentException(
          "There must be exactly same number of tokens and additional context!");
    }

    if (!this.additionalContext) {
      return new POSSample(sentence, tags);
    }

    // line 0: contractions
    // line 1: props
    String[][] ac = new String[2][size];
    for (int i = 0; i < size; i++) {
      ac[0][i] = contractions.get(i);
      ac[1][i] = prop.get(i);
    }
    return new POSSample(sentence, tags, ac);
  }

  /**
   * Walks the tree depth-first, in element order, and hands every leaf to
   * {@link #processLeaf}. A {@code null} node is ignored.
   */
  private void process(Node node, List<String> sentence, List<String> tags,
      List<String> con, List<String> prop) {
    if (node == null) {
      return;
    }
    for (TreeElement element : node.getElements()) {
      if (element.isLeaf()) {
        processLeaf((Leaf) element, sentence, tags, con, prop);
      } else {
        process((Node) element, sentence, tags, con, prop);
      }
    }
  }

  /**
   * Appends the token(s) derived from one leaf, together with their tags and
   * additional context, to the four parallel lists. A {@code null} leaf is
   * ignored.
   */
  private void processLeaf(Leaf leaf, List<String> sentence,
      List<String> tags, List<String> con, List<String> prop) {
    if (leaf == null) {
      return;
    }

    String lexeme = normalizeQuotes(leaf.getLexeme());
    String contraction = contractionMarker(leaf.getSecondaryTag());

    // Tokens without a functional tag (e.g. punctuation) are tagged with
    // their own lexeme.
    String tag = leaf.getFunctionalTag();
    if (tag == null) {
      tag = lexeme;
    }
    if (isIncludeFeatures && leaf.getMorphologicalTag() != null) {
      tag += " " + leaf.getMorphologicalTag();
    }
    // The tag must be a single token, so whitespace runs become '='.
    tag = tag.replaceAll("\\s+", "=");

    if (expandME && lexeme.contains("_")) {
      addMultiword(lexeme, tag, contraction, sentence, tags, con, prop);
    } else if (lexeme.contains("-") && lexeme.length() > 1) {
      addHyphenated(lexeme, tag, contraction, sentence, tags, con, prop);
    } else {
      // Gender is only appended to plain (non multiword, non hyphenated)
      // tokens.
      tag = addGender(tag, leaf.getMorphologicalTag());
      addToken(lexeme, tag, contraction, null, sentence, tags, con, prop);
    }
  }

  /**
   * Replaces a guillemet lexeme by a plain double quote on even-numbered
   * reads; every other lexeme is returned unchanged.
   */
  private String normalizeQuotes(String lexeme) {
    boolean isGuillemet = OPEN_GUILLEMET.equals(lexeme)
        || CLOSE_GUILLEMET.equals(lexeme);
    if (isGuillemet && callsCount % 2 == 0) {
      return "\"";
    }
    return lexeme;
  }

  /**
   * Maps the secondary tag to a contraction marker: "B" when it contains
   * {@code <sam->}, "E" when it contains {@code <-sam>}, otherwise
   * {@code null}.
   */
  private static String contractionMarker(String secondaryTag) {
    if (secondaryTag == null) {
      return null;
    }
    if (secondaryTag.contains("<sam->")) {
      return "B";
    }
    if (secondaryTag.contains("<-sam>")) {
      return "E";
    }
    return null;
  }

  /**
   * Handles a lexeme containing '_' when multiword expansion is enabled.
   * Proper names ("prop") are kept as one token marked with "P"; any other
   * expression is split on '_' and tagged B-/I-, with the contraction marker
   * attached to the last part only.
   */
  private void addMultiword(String lexeme, String tag, String contraction,
      List<String> sentence, List<String> tags, List<String> con,
      List<String> prop) {
    if ("prop".equals(tag)) {
      // The contraction marker is intentionally not recorded for proper names.
      addToken(lexeme, tag, null, "P", sentence, tags, con, prop);
      return;
    }

    StringTokenizer tokenizer = new StringTokenizer(lexeme, "_");
    int count = tokenizer.countTokens();
    if (count == 0) {
      // Lexeme made only of underscores: keep it untouched.
      addToken(lexeme, tag, contraction, null, sentence, tags, con, prop);
      return;
    }

    for (int i = 0; i < count; i++) {
      String prefix = (i == 0) ? "B-" : "I-";
      String marker = (i == count - 1) ? contraction : null;
      addToken(tokenizer.nextToken(), prefix + tag, marker, null, sentence,
          tags, con, prop);
    }
  }

  /**
   * Handles a lexeme containing '-'. When it matches {@link #HYPHEN_PATTERN}
   * it is split into its parts, with the hyphen itself emitted as a token
   * tagged "-"; otherwise the lexeme is added unchanged. Every emitted token
   * receives the leaf's contraction marker.
   */
  private void addHyphenated(String lexeme, String tag, String contraction,
      List<String> sentence, List<String> tags, List<String> con,
      List<String> prop) {
    Matcher matcher = HYPHEN_PATTERN.matcher(lexeme);
    if (!matcher.matches()) {
      addToken(lexeme, tag, contraction, null, sentence, tags, con, prop);
      return;
    }

    String firstTok = null;
    String secondTok = null;
    String rest = null;
    if (matcher.group(1) != null) {
      // word-
      firstTok = matcher.group(2);
    } else if (matcher.group(3) != null) {
      // -word...
      secondTok = matcher.group(4);
      rest = matcher.group(5);
    } else if (matcher.group(6) != null) {
      // word-word...
      firstTok = matcher.group(7);
      secondTok = matcher.group(8);
      rest = matcher.group(9);
    } else {
      throw new IllegalStateException("wrong hyphen pattern");
    }

    addIfPresent(firstTok, tag, contraction, sentence, tags, con, prop);
    addIfPresent("-", "-", contraction, sentence, tags, con, prop);
    addIfPresent(secondTok, tag, contraction, sentence, tags, con, prop);
    addIfPresent(rest, tag, contraction, sentence, tags, con, prop);
  }

  /** Adds the token unless {@link StringsUtil#isNullOrEmpty} rejects it. */
  private static void addIfPresent(String token, String tag,
      String contraction, List<String> sentence, List<String> tags,
      List<String> con, List<String> prop) {
    if (!StringsUtil.isNullOrEmpty(token)) {
      addToken(token, tag, contraction, null, sentence, tags, con, prop);
    }
  }

  /** Appends one entry to each of the four parallel lists. */
  private static void addToken(String token, String tag, String contraction,
      String propMarker, List<String> sentence, List<String> tags,
      List<String> con, List<String> prop) {
    sentence.add(token);
    tags.add(tag);
    con.add(contraction);
    prop.add(propMarker);
  }

  /**
   * Appends a gender suffix to noun ("n") and article ("art") tags: "m" when
   * the morphological tag contains a standalone {@code M}, "f" for a
   * standalone {@code F}. Tags whose morphology contains {@code M/F}, and all
   * other tags, are returned unchanged.
   */
  private static String addGender(String tag, String morphologicalTag) {
    boolean genderedTag = "n".equals(tag) || "art".equals(tag);
    if (!genderedTag || morphologicalTag == null) {
      return tag;
    }
    if (GENDER_N.matcher(morphologicalTag).matches()) {
      // M/F must be tested first because it also matches M and F alone.
      return tag;
    }
    if (GENDER_M.matcher(morphologicalTag).matches()) {
      return tag + "m";
    }
    if (GENDER_F.matcher(morphologicalTag).matches()) {
      return tag + "f";
    }
    return tag;
  }

  /**
   * Repositions the underlying stream at its beginning and restarts the read
   * counter, so that a second pass applies the quote replacement to the same
   * sentences as the first pass did.
   */
  public void reset() throws IOException, UnsupportedOperationException {
    adSentenceStream.reset();
    callsCount = 0;
  }

  /** Closes the underlying sentence stream. */
  public void close() throws IOException {
    adSentenceStream.close();
  }
}