/**
 * Copyright 2014 Marco Cornolti
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.acubelab.smaph;

import it.unipi.di.acube.batframework.utils.Pair;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.tartarus.snowball.ext.EnglishStemmer;

public class SmaphUtils {

    /**
     * For each word of bold, finds the word in query that has the minimum edit
     * distance, normalized by the word length. Returns the average of those
     * distances.
     *
     * @param query
     *            a query.
     * @param bold
     *            a bold.
     * @return the averaged normalized word-by-word edit distance of bold
     *         against query.
     */
    public static double getMinEditDist(String query, String bold) {
        return getMinEditDist(query, bold, null);
    }

    /**
     * For each word of bold, finds the word in query that has the minimum edit
     * distance, normalized by the word length. Put that word in minTokens.
     * Returns the average of those distances.
     *
     * @param query
     *            a query.
     * @param bold
     *            a bold.
     * @param minTokens
     *            the tokens of query having minimum edit distance.
     * @return the averaged normalized word-by-word edit distance of bold
     *         against query.
     */
    public static double getMinEditDist(String query, String bold,
            List<String> minTokens) {
        List<String> tokensQ = tokenize(query);
        List<String> tokensB = tokenize(bold);

        if (tokensB.size() == 0 || tokensQ.size() == 0)
            return 1;

        float avgMinDist = 0;
        for (String tokenB : tokensB) {
            float minDist = Float.MAX_VALUE;
            String bestQToken = null;
            for (String tokenQ : tokensQ) {
                float relLev = getNormEditDistance(tokenB, tokenQ);
                if (relLev < minDist) {
                    minDist = relLev;
                    bestQToken = tokenQ;
                }
            }
            if (minTokens != null)
                minTokens.add(bestQToken);
            avgMinDist += minDist;
        }
        return avgMinDist / tokensB.size();
    }

    /**
     * @param tokenB
     *            a word.
     * @param tokenQ
     *            another word.
     * @return the normalized edit distance between tokenB and tokenQ.
     */
    public static float getNormEditDistance(String tokenB, String tokenQ) {
        if (tokenQ.isEmpty() || tokenB.isEmpty())
            return 1;
        int lev = StringUtils.getLevenshteinDistance(tokenB, tokenQ);
        return (float) lev / (float) Math.max(tokenB.length(), tokenQ.length());
    }

    /**
     * @param title
     *            the title of a Wikipedia page.
     * @return true iff the title is that of a regular page.
     */
    public static boolean acceptWikipediaTitle(String title) {
        // TODO: this can definitely be done in a cleaner way.
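        // Reject titles in non-article namespaces (Talk:, Special:, Portal:,
        // Wikipedia:, Wikipedia_talk:, File:, User:, Category:), titles
        // starting with "List", and titles of disambiguation pages.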
        return !(title.startsWith("Talk:") || title.startsWith("Special:")
                || title.startsWith("Portal:") || title.startsWith("Wikipedia:")
                || title.startsWith("Wikipedia_talk:") || title.startsWith("File:")
                || title.startsWith("User:") || title.startsWith("Category:")
                || title.startsWith("List") || title.contains("(disambiguation)"));
    }

    /**
     * @param ftrCount
     *            the number of features.
     * @return a vector containing all feature ids from 1 to ftrCount.
     */
    public static Vector<Integer> getAllFtrVect(int ftrCount) {
        Vector<Integer> res = new Vector<>();
        for (int i = 1; i <= ftrCount; i++)
            res.add(i);
        return res;
    }

    /**
     * Turns a list of pairs <b,r>, where b is a bold and r is the position in
     * which the bold occurred, to the list of bolds and the hashmap between a
     * position and the list of bolds occurring in that position.
     *
     * @param boldAndRanks
     *            a list of pairs <b,r>, where b is a bold and r is the position
     *            in which the bold occurred.
     * @param positions
     *            where to store the mapping between a position (rank) and all
     *            bolds that appear in that position.
     * @param bolds
     *            where to store the bolds.
     */
    public static void mapRankToBoldsLC(
            List<Pair<String, Integer>> boldAndRanks,
            HashMap<Integer, HashSet<String>> positions, HashSet<String> bolds) {

        for (Pair<String, Integer> boldAndRank : boldAndRanks) {
            String spot = boldAndRank.first.toLowerCase();
            int rank = boldAndRank.second;
            if (bolds != null)
                bolds.add(spot);
            if (positions != null) {
                if (!positions.containsKey(rank))
                    positions.put(rank, new HashSet<String>());
                positions.get(rank).add(spot);
            }
        }
    }

    /**
     * Turns a list of pairs <b,r>, where b is a bold and r is the position in
     * which the bold occurred, to a mapping from a bold to the positions in
     * which the bolds occurred.
     *
     * @param boldAndRanks
     *            a list of pairs <b,r>, where b is a bold and r is the position
     *            in which the bold occurred.
     * @return a mapping from a bold to the positions in which the bold
     *         occurred.
     */
    public static HashMap<String, HashSet<Integer>> findPositionsLC(
            List<Pair<String, Integer>> boldAndRanks) {
        HashMap<String, HashSet<Integer>> positions = new HashMap<>();
        for (Pair<String, Integer> boldAndRank : boldAndRanks) {
            String bold = boldAndRank.first.toLowerCase();
            int rank = boldAndRank.second;
            if (!positions.containsKey(bold))
                positions.put(bold, new HashSet<Integer>());
            positions.get(bold).add(rank);
        }
        return positions;
    }

    /**
     * Given a string, replaces all words with their stemmed version.
     *
     * @param str
     *            a string.
     * @param stemmer
     *            the stemmer.
     * @return str with all words stemmed.
     */
    public static String stemString(String str, EnglishStemmer stemmer) {
        String stemmedString = "";
        String[] words = str.split("\\s+");
        for (int i = 0; i < words.length; i++) {
            String word = words[i];
            stemmer.setCurrent(word);
            stemmer.stem();
            stemmedString += stemmer.getCurrent();
            // Separate words with a space, but do not append a trailing space
            // after the last word.
            if (i != words.length - 1)
                stemmedString += " ";
        }
        return stemmedString;
    }

    /**
     * Compress a string with GZip.
     *
     * @param str
     *            the string.
     * @return the compressed string, as an array of bytes.
     * @throws IOException
     *             if something went wrong during compression.
     */
    public static byte[] compress(String str) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = new GZIPOutputStream(out);
        // Encode as UTF-8 so that compress() and decompress() agree on the
        // charset regardless of the platform default.
        gzip.write(str.getBytes("UTF-8"));
        gzip.close();
        return out.toByteArray();
    }
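    /*
     * Illustrative sketch, not part of the original utilities: demonstrates the
     * intended round trip between compress() and decompress(). The method name
     * compressionRoundTripExample and the sample string are hypothetical.
     */
    private static String compressionRoundTripExample() throws IOException {
        byte[] packed = compress("armstrong moon landing"); // gzip the string
        return decompress(packed); // yields "armstrong moon landing" again
    }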
    /**
     * Decompress a GZipped string.
     *
     * @param compressed
     *            the sequence of bytes.
     * @return the decompressed string.
     * @throws IOException
     *             if something went wrong during decompression.
     */
    public static String decompress(byte[] compressed) throws IOException {
        GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(
                compressed));
        BufferedReader bf = new BufferedReader(new InputStreamReader(gis,
                "UTF-8"));
        // Read all characters, including line terminators, so that the string
        // originally passed to compress() is reconstructed exactly.
        StringBuilder outStr = new StringBuilder();
        char[] buffer = new char[4096];
        int read;
        while ((read = bf.read(buffer)) != -1)
            outStr.append(buffer, 0, read);
        bf.close();
        return outStr.toString();
    }

    public static List<String> tokenize(String text) {
        // Replace runs of non-word characters with a single space, lower-case
        // the text, and split it into tokens.
        text = text.replaceAll("\\W+", " ").toLowerCase();
        Vector<String> tokens = new Vector<>(Arrays.asList(text.split("\\s+")));
        // A leading non-word character produces an empty first token: drop it.
        tokens.remove("");
        return tokens;
    }
}
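/*
 * Minimal usage sketch, assuming the utilities above are on the classpath. The
 * class name SmaphUtilsExample and the sample strings are illustrative only and
 * are not part of the original file.
 */
class SmaphUtilsExample {
    public static void main(String[] args) {
        // Average word-by-word normalized edit distance between a query and a
        // bold: 0 means every word of the bold also appears in the query.
        System.out.println(SmaphUtils.getMinEditDist("armstrong moon landing",
                "neil armstrong"));

        // Tokenization lower-cases the text and splits on non-word characters.
        System.out.println(SmaphUtils.tokenize("Neil Armstrong (astronaut)"));

        // Titles in service namespaces are rejected; regular titles pass.
        System.out.println(SmaphUtils.acceptWikipediaTitle("Talk:Moon")); // false
        System.out.println(SmaphUtils.acceptWikipediaTitle("Moon"));      // true
    }
}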