/*******************************************************************************
* Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*******************************************************************************/
package eu.project.ttc.resources;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.uima.resource.DataResource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.SharedResourceObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.project.ttc.types.WordAnnotation;
import fr.univnantes.lina.uima.tkregex.LabelledAnnotation;
import fr.univnantes.lina.uima.tkregex.RegexOccurrence;
/**
 * Occurrence filter that rejects occurrences containing too many characters
 * outside of an allowed character set.
 *
 * <p>The allowed characters are read from the shared resource passed to
 * {@link #load(DataResource)}: every non-whitespace character of that
 * resource is allowed. As long as no character set has been loaded (or if
 * loading failed), the filter accepts every occurrence.
 */
public class CharacterFootprintTermFilter implements SharedResourceObject, OccurrenceFilter {
    private static final Logger LOGGER = LoggerFactory.getLogger(CharacterFootprintTermFilter.class);

    /** Percentage of disallowed characters at or above which an occurrence is rejected. */
    private static final int BAD_CHAR_RATE_THRESHOLD = 41;

    /** The allowed characters, or {@code null} when the filter must accept everything. */
    private char[] allowedChars;

    /**
     * Decides whether an occurrence is kept.
     *
     * <p>An occurrence is rejected when any of the following holds:
     * <ul>
     * <li>more than one of its words contains a disallowed character,</li>
     * <li>it has several words whose total length is at most three
     * characters per word,</li>
     * <li>the percentage of disallowed characters (integer division) is at
     * least {@code BAD_CHAR_RATE_THRESHOLD}.</li>
     * </ul>
     *
     * @param occurrence the occurrence to test; its labelled annotations are
     *        cast to {@link WordAnnotation}
     * @return {@code true} if the occurrence is accepted, {@code false} otherwise
     */
    @Override
    public boolean accept(RegexOccurrence occurrence) {
        if (allowedChars == null) {
            return true;
        }

        int totalChars = 0;
        int totalWords = 0;
        int nbBadWords = 0;
        MutableInt badChars = new MutableInt(0);
        for (LabelledAnnotation a : occurrence.getLabelledAnnotations()) {
            WordAnnotation w = (WordAnnotation) a.getAnnotation();
            totalChars += w.getCoveredText().length();
            totalWords += 1;
            if (isBadWord(w, badChars)) {
                nbBadWords += 1;
            }
        }

        if (nbBadWords > 1) {
            return false;
        }
        if (totalChars <= totalWords * 3 && totalWords > 1) {
            return false;
        }
        if (totalChars == 0) {
            // No character at all (no word, or a single empty word): the bad
            // character rate is undefined and there is nothing to hold against
            // the occurrence. Guarding here avoids a division by zero below.
            return true;
        }

        int badCharRate = 100 * badChars.intValue() / totalChars;
        return badCharRate < BAD_CHAR_RATE_THRESHOLD;
    }

    /**
     * Counts the disallowed characters of a word.
     *
     * @param anno the word annotation
     * @param badChars the bad character counter, incremented once for every
     *        character of the word that is not allowed
     * @return {@code true} if the word has at least one disallowed character,
     *         {@code false} otherwise
     */
    private boolean isBadWord(WordAnnotation anno, MutableInt badChars) {
        final String coveredText = anno.getCoveredText();
        boolean foundOneBadChar = false;
        for (int i = 0; i < coveredText.length(); i++) {
            if (!isAllowed(coveredText.charAt(i))) {
                badChars.increment();
                foundOneBadChar = true;
            }
        }
        return foundOneBadChar;
    }

    /**
     * Tells whether a character belongs to the allowed character set.
     *
     * @param c the character to look up
     * @return {@code true} if {@code c} is one of the allowed characters
     */
    private boolean isAllowed(char c) {
        for (char a : this.allowedChars) {
            if (a == c) {
                return true;
            }
        }
        return false;
    }

    /**
     * Loads the allowed character set from the given resource.
     *
     * <p>The resource is decoded as UTF-8 and every non-whitespace character
     * is added to the allowed set. If anything goes wrong while reading, the
     * problem is logged and the filter falls back to accepting every
     * occurrence instead of failing the initialization.
     *
     * @param aData the resource holding the allowed characters
     * @throws ResourceInitializationException declared by the interface; the
     *         read failures handled here are logged, not rethrown
     */
    @Override
    public void load(DataResource aData) throws ResourceInitializationException {
        LOGGER.debug("Loading resource character footprint resource at: {}", aData.getUri());
        InputStream inputStream = null;
        try {
            inputStream = aData.getInputStream();
            BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, Charset.forName("UTF-8")));
            StringBuilder chars = new StringBuilder();
            int c;
            while ((c = br.read()) != -1) {
                if (!Character.isWhitespace((char) c)) {
                    chars.append((char) c);
                }
            }
            // Only publish the character set once the resource was read entirely.
            this.allowedChars = chars.toString().toCharArray();
        } catch (IOException e) {
            LOGGER.error("Could not load resource character footprint resource due to an exception.", e);
            LOGGER.warn("Continuing with the TrueFilter (always accept terms)");
            this.allowedChars = null;
        } catch (Exception e) {
            // Deliberate best effort: any other failure also disables the filter.
            LOGGER.warn("PB loading " + aData.getUri() + ". Continuing with the TrueFilter (always accept terms)", e);
            this.allowedChars = null;
        } finally {
            // Closing the underlying stream also releases the reader built on it.
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    LOGGER.error("Could not close input stream.", e);
                }
            }
        }
    }
}