/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.stanbol.enhancer.engines.entitycoreference.impl;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.clerezza.commons.rdf.IRI;
import org.apache.stanbol.enhancer.engines.entitycoreference.Constants;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.NounPhrase;
import org.apache.stanbol.enhancer.engines.entitycoreference.datamodel.PlaceAdjectival;
import org.apache.stanbol.enhancer.nlp.model.Span;
import org.osgi.service.cm.ConfigurationException;

/**
 * Contains information about several terms and properties of words we use in the {@link CoreferenceFinder}.
 *
 * @author Cristian Petroaca
 *
 */
class Dictionaries {
    /**
     * Contains the place adjectivals in the form: language -> (lower-cased adjectival -> place IRI).
     * A place may have several adjectivals; each of them gets its own entry pointing to the same IRI,
     * which keeps the lookup a single map access.
     */
    private final Map<String,Map<String,IRI>> placeAdjectivalsMap;

    /**
     * Loads the place adjectivals of every given language from the class path resource
     * {@code Constants.PLACE_ADJECTIVALS_FOLDER + "/" + language}.
     *
     * @param languages
     *            the languages for which a place adjectivals resource is loaded
     * @param entityUriBase
     *            prefix which is prepended to the trimmed place name in order to build the place IRI
     * @throws ConfigurationException
     *             if a resource is missing, cannot be read or contains a malformed line
     */
    public Dictionaries(String[] languages, String entityUriBase) throws ConfigurationException {
        placeAdjectivalsMap = new HashMap<>();

        for (String language : languages) {
            placeAdjectivalsMap.put(language, loadPlaceAdjectivals(language, entityUriBase));
        }
    }

    /**
     * Reads the place adjectivals resource of a single language. Every non-blank line has the form
     * {@code <place>\t<adjectival>[,<adjectival>...]}.
     *
     * @param language
     *            the language whose resource is read
     * @param entityUriBase
     *            prefix which is prepended to the trimmed place name in order to build the place IRI
     * @return map from lower-cased adjectival to the IRI of its place
     * @throws ConfigurationException
     *             if the resource is missing, cannot be read or contains a malformed line
     */
    private static Map<String,IRI> loadPlaceAdjectivals(String language, String entityUriBase)
            throws ConfigurationException {
        String resource = Constants.PLACE_ADJECTIVALS_FOLDER + "/" + language;

        // getResourceAsStream signals a missing resource with null, not with an exception.
        InputStream langIn = Dictionaries.class.getResourceAsStream(resource);
        if (langIn == null) {
            throw new ConfigurationException("", "Could not find " + resource);
        }

        Map<String,IRI> languagePlaceAdjMap = new HashMap<>();

        // Decode explicitly as UTF-8 so that the result does not depend on the platform default charset.
        // NOTE(review): assumes the bundled adjectival files are UTF-8 encoded - confirm.
        // Closing the reader also closes the wrapped resource stream.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(langIn, StandardCharsets.UTF_8))) {
            String line;
            int lineNumber = 0;

            while ((line = reader.readLine()) != null) {
                lineNumber++;

                if (line.trim().isEmpty()) {
                    continue;
                }

                String[] columns = line.split("\t");
                if (columns.length < 2) {
                    throw new ConfigurationException("", "Malformed line " + lineNumber + " in " + resource
                            + ": expected a place and its adjectivals separated by a tab");
                }

                IRI ref = new IRI(entityUriBase + columns[0].trim());

                for (String adjectival : columns[1].split(",")) {
                    // Locale.ROOT keeps the keys independent of the JVM default locale; the lookup in
                    // findPlaceAdjectival lower-cases the tokens the same way.
                    String key = adjectival.trim().toLowerCase(Locale.ROOT);
                    if (!key.isEmpty()) {
                        languagePlaceAdjMap.put(key, ref);
                    }
                }
            }
        } catch (IOException ioe) {
            throw new ConfigurationException("", "Could not read " + resource, ioe);
        }

        return languagePlaceAdjMap;
    }

    /**
     * Checks whether a {@link NounPhrase} contains a place adjectival and returns it.
     *
     * @param language
     *            the language of the noun phrase
     * @param nounPhrase
     *            the noun phrase whose tokens are searched
     * @return the {@link PlaceAdjectival} if the {@link NounPhrase} contains one or null if not. Null is
     *         also returned if no place adjectivals were loaded for the given language.
     */
    public PlaceAdjectival findPlaceAdjectival(String language, NounPhrase nounPhrase) {
        Map<String,IRI> langPlaceAdjectivalsMap = placeAdjectivalsMap.get(language);
        if (langPlaceAdjectivalsMap == null) {
            return null;
        }

        List<Span> tokens = nounPhrase.getTokens();

        /*
         * Go through all 1-grams and 2-grams and see if we have a match in the place adjectivals map.
         * 2-grams should be good enough since there are no 3-gram places at least from what I saw.
         */
        for (int i = 0; i < tokens.size(); i++) {
            Span currentToken = tokens.get(i);
            String currentTokenString = currentToken.getSpan().toLowerCase(Locale.ROOT);

            // First the current 1-gram. The map never holds null values, so a non-null result is a match.
            IRI ref = langPlaceAdjectivalsMap.get(currentTokenString);
            if (ref != null) {
                return new PlaceAdjectival(currentToken.getStart(), currentToken.getEnd(), ref);
            }

            /*
             * Then the 2-gram with the token after it. The 2-gram with the token before it does not need
             * a separate check: it is the same string that was tested as the "token after" 2-gram of the
             * previous iteration, which would already have returned on a match.
             */
            if (i < tokens.size() - 1) {
                Span nextToken = tokens.get(i + 1);
                String twoGram = currentTokenString + " " + nextToken.getSpan().toLowerCase(Locale.ROOT);

                ref = langPlaceAdjectivalsMap.get(twoGram);
                if (ref != null) {
                    return new PlaceAdjectival(currentToken.getStart(), nextToken.getEnd(), ref);
                }
            }
        }

        return null;
    }
}