/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.io;
import java.util.List;
import java.util.regex.Pattern;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.project.ttc.engines.morpho.Segment;
import eu.project.ttc.engines.morpho.Segmentation;
import fr.univnantes.julestar.uima.resources.ResourceFormatException;
/**
 * Parses the bracketed textual representation of a segmentation into a
 * {@link Segmentation}.
 *
 * <p>Each segment is written between square brackets, either as
 * {@code [substring]} or as {@code [substring:lemma]}. Characters found
 * outside of brackets are copied verbatim to the target string. Brackets
 * cannot be nested, and an input made only of an empty pair of brackets
 * (optionally surrounded by whitespace) denotes an empty segmentation.
 */
public class SegmentationParser {

    /** Matches an input made only of an empty pair of brackets and optional whitespace. */
    private static final Pattern EMPTY_SEGMENTATION = Pattern.compile("\\s*\\[\\s*\\]\\s*");

    /** Separator between the substring and the lemma inside a segment. */
    private static final String COLON = ":";

    /**
     * Parses the given segmentation string.
     *
     * @param string the bracketed segmentation string to parse
     * @return the parsed segmentation, built from the target string (segment
     *         substrings plus the characters found outside of brackets, trimmed)
     *         and the segments in order of appearance
     * @throws ResourceFormatException if brackets are nested or unbalanced, or if
     *         a segment is empty or malformed
     */
    public Segmentation parse(String string) {
        if (EMPTY_SEGMENTATION.matcher(string).matches()) {
            // NOTE(review): the raw input (e.g. "[]") is handed over as is;
            // confirm this is what Segmentation expects for the empty case.
            return new Segmentation(string);
        }

        boolean inSegment = false;
        StringBuilder targetString = new StringBuilder();
        StringBuilder currentSequence = new StringBuilder();
        List<Segment> segments = Lists.newArrayList();

        for (int index = 0; index < string.length(); index++) {
            char c = string.charAt(index);
            if (c == '[') {
                if (inSegment) {
                    fail("Illegal character \"[\" at index %d", index);
                }
                currentSequence.setLength(0);
                inSegment = true;
            } else if (c == ']') {
                if (!inSegment) {
                    fail("Illegal character \"]\" at index %d", index);
                }
                String substring = currentSequence.toString().trim();
                if (substring.isEmpty()) {
                    fail("Empty segment not allowed");
                }
                // The segment starts where the target string currently ends.
                // NOTE(review): the target string is trimmed at the very end, so
                // leading whitespace before the first segment would shift these
                // offsets relative to the final string - confirm inputs never
                // start with whitespace.
                Segment segment = toSegment(targetString.length(), substring);
                segments.add(segment);
                targetString.append(segment.getSubstring());
                inSegment = false;
            } else if (inSegment) {
                currentSequence.append(c);
            } else {
                targetString.append(c);
            }
        }

        if (inSegment) {
            fail("Expected \"]\" at end of string");
        }

        return new Segmentation(
                targetString.toString().trim(),
                segments.toArray(new Segment[segments.size()]));
    }

    /**
     * Builds a segment from the content found between a pair of brackets.
     *
     * @param begin  the begin offset of the segment in the target string
     * @param string the bracket content, {@code substring} or {@code substring:lemma}
     * @return a segment spanning {@code begin} to {@code begin + substring.length()},
     *         whose lemma is the given lemma or, when none is given, the substring
     * @throws ResourceFormatException if the content starts or ends with a colon,
     *         or contains more than one colon
     */
    private Segment toSegment(int begin, String string) {
        String trimmed = string.trim();
        if (trimmed.startsWith(COLON)) {
            fail("Cannot start segment with \":\"");
        }
        if (trimmed.endsWith(COLON)) {
            fail("Cannot end segment with \":\"");
        }

        // Splitter always yields at least one element, so only the upper bound
        // needs checking: more than two parts means more than one colon.
        List<String> parts = Splitter.on(':').splitToList(trimmed);
        if (parts.size() > 2) {
            fail("Only one \":\" allowed in segment");
        }

        String substring = parts.get(0).trim();
        Segment segment = new Segment(begin, begin + substring.length());
        segment.setSubstring(substring);
        segment.setLemma(parts.size() == 2 ? parts.get(1).trim() : substring);
        return segment;
    }

    /**
     * Reports a format error. This method never returns normally.
     *
     * @param string the message, as a {@link String#format} pattern
     * @param args   the arguments of the message pattern
     * @throws ResourceFormatException always
     */
    private void fail(String string, Object... args) {
        throw new ResourceFormatException(String.format(string, args));
    }
}