/**
 * Copyright 2014 Marco Cornolti
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package it.acubelab.smaph;

import it.unipi.di.acube.batframework.utils.Pair;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.tartarus.snowball.ext.EnglishStemmer;

public class SmaphUtils {

    /**
     * For each word of bold, finds the word in query that has the minimum edit
     * distance, normalized by the word length. Returns the average of those
     * distances.
     *
     * @param query
     *            a query.
     * @param bold
     *            a bold.
     * @return the averaged normalized word-by-word edit distance of bold
     *         against query.
     */
    public static double getMinEditDist(String query, String bold) {
        return getMinEditDist(query, bold, null);
    }

    /**
     * For each word of bold, finds the word in query that has the minimum edit
     * distance, normalized by the word length. Put that word in minTokens.
     * Returns the average of those distances.
     *
     * @param query
     *            a query.
     * @param bold
     *            a bold.
     * @param minTokens
     *            the tokens of query having minimum edit distance.
     * @return the averaged normalized word-by-word edit distance of bold
     *         against query.
     */
    public static double getMinEditDist(String query, String bold,
            List<String> minTokens) {
        List<String> tokensQ = tokenize(query);
        List<String> tokensB = tokenize(bold);

        if (tokensB.size() == 0 || tokensQ.size() == 0)
            return 1;

        float avgMinDist = 0;
        for (String tokenB : tokensB) {
            float minDist = Float.MAX_VALUE;
            String bestQToken = null;
            for (String tokenQ : tokensQ) {
                float relLev = getNormEditDistance(tokenB, tokenQ);
                if (relLev < minDist) {
                    minDist = relLev;
                    bestQToken = tokenQ;
                }
            }
            if (minTokens != null)
                minTokens.add(bestQToken);
            avgMinDist += minDist;
        }
        return avgMinDist / tokensB.size();
    }

    /**
     * @param tokenB
     *            a word.
     * @param tokenQ
     *            another word.
     * @return the normalized edit distance between tokenB and tokenQ.
     */
    public static float getNormEditDistance(String tokenB, String tokenQ) {
        if (tokenQ.isEmpty() || tokenB.isEmpty())
            return 1;
        int lev = StringUtils.getLevenshteinDistance(tokenB, tokenQ);
        return (float) lev / (float) Math.max(tokenB.length(), tokenQ.length());
    }

    /**
     * @param title
     *            the title of a Wikipedia page.
     * @return true iff the title is that of a regular page.
     */
    public static boolean acceptWikipediaTitle(String title) {
        // TODO: this can definitely be done in a cleaner way.
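        // Reject titles in non-article namespaces (Talk:, Special:, Portal:,
        // Wikipedia:, Wikipedia_talk:, File:, User:, Category:), titles
        // starting with "List", and titles of disambiguation pages.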
        return !(title.startsWith("Talk:") || title.startsWith("Special:")
                || title.startsWith("Portal:") || title.startsWith("Wikipedia:")
                || title.startsWith("Wikipedia_talk:") || title.startsWith("File:")
                || title.startsWith("User:") || title.startsWith("Category:")
                || title.startsWith("List") || title.contains("(disambiguation)"));
    }

    /**
     * @param ftrCount
     *            the number of features.
     * @return a vector containing all feature ids from 1 to ftrCount.
     */
    public static Vector<Integer> getAllFtrVect(int ftrCount) {
        Vector<Integer> res = new Vector<>();
        for (int i = 1; i <= ftrCount; i++)
            res.add(i);
        return res;
    }

    /**
     * Turns a list of pairs <b,r>, where b is a bold and r is the position in
     * which the bold occurred, to the list of bolds and the hashmap between a
     * position and the list of bolds occurring in that position.
     *
     * @param boldAndRanks
     *            a list of pairs <b,r>, where b is a bold and r is the position
     *            in which the bold occurred.
     * @param positions
     *            where to store the mapping between a position (rank) and all
     *            bolds that appear in that position.
     * @param bolds
     *            where to store the bolds.
     */
    public static void mapRankToBoldsLC(
            List<Pair<String, Integer>> boldAndRanks,
            HashMap<Integer, HashSet<String>> positions, HashSet<String> bolds) {

        for (Pair<String, Integer> boldAndRank : boldAndRanks) {
            String spot = boldAndRank.first.toLowerCase();
            int rank = boldAndRank.second;
            if (bolds != null)
                bolds.add(spot);
            if (positions != null) {
                if (!positions.containsKey(rank))
                    positions.put(rank, new HashSet<String>());
                positions.get(rank).add(spot);
            }
        }
    }

    /**
     * Turns a list of pairs <b,r>, where b is a bold and r is the position in
     * which the bold occurred, to a mapping from a bold to the positions in
     * which the bolds occurred.
     *
     * @param boldAndRanks
     *            a list of pairs <b,r>, where b is a bold and r is the position
     *            in which the bold occurred.
     * @return a mapping from a bold to the positions in which the bold
     *         occurred.
     */
    public static HashMap<String, HashSet<Integer>> findPositionsLC(
            List<Pair<String, Integer>> boldAndRanks) {
        HashMap<String, HashSet<Integer>> positions = new HashMap<>();
        for (Pair<String, Integer> boldAndRank : boldAndRanks) {
            String bold = boldAndRank.first.toLowerCase();
            int rank = boldAndRank.second;
            if (!positions.containsKey(bold))
                positions.put(bold, new HashSet<Integer>());
            positions.get(bold).add(rank);
        }
        return positions;
    }

    /**
     * Given a string, replaces all words with their stemmed version.
     *
     * @param str
     *            a string.
     * @param stemmer
     *            the stemmer.
     * @return str with all words stemmed.
     */
    public static String stemString(String str, EnglishStemmer stemmer) {
        String stemmedString = "";
        String[] words = str.split("\\s+");
        for (int i = 0; i < words.length; i++) {
            String word = words[i];
            stemmer.setCurrent(word);
            stemmer.stem();
            stemmedString += stemmer.getCurrent();
            // Separate words with a space, but do not append a trailing space
            // after the last word.
            if (i != words.length - 1)
                stemmedString += " ";
        }
        return stemmedString;
    }

    /**
     * Compress a string with GZip.
     *
     * @param str
     *            the string.
     * @return the compressed string, as an array of bytes.
     * @throws IOException
     *             if something went wrong during compression.
     */
    public static byte[] compress(String str) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = new GZIPOutputStream(out);
        // Encode as UTF-8 so that compress() and decompress() agree on the
        // charset regardless of the platform default.
        gzip.write(str.getBytes("UTF-8"));
        gzip.close();
        return out.toByteArray();
    }
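    /*
     * Illustrative sketch, not part of the original utilities: demonstrates the
     * intended round trip between compress() and decompress(). The method name
     * compressionRoundTripExample and the sample string are hypothetical.
     */
    private static String compressionRoundTripExample() throws IOException {
        byte[] packed = compress("armstrong moon landing"); // gzip the string
        return decompress(packed); // yields "armstrong moon landing" again
    }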
    /**
     * Decompress a GZipped string.
     *
     * @param compressed
     *            the sequence of bytes.
     * @return the decompressed string.
     * @throws IOException
     *             if something went wrong during decompression.
     */
    public static String decompress(byte[] compressed) throws IOException {
        GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(
                compressed));
        BufferedReader bf = new BufferedReader(new InputStreamReader(gis,
                "UTF-8"));
        // Read all characters, including line terminators, so that the string
        // originally passed to compress() is reconstructed exactly.
        StringBuilder outStr = new StringBuilder();
        char[] buffer = new char[4096];
        int read;
        while ((read = bf.read(buffer)) != -1)
            outStr.append(buffer, 0, read);
        bf.close();
        return outStr.toString();
    }

    public static List<String> tokenize(String text) {
        // Replace runs of non-word characters with a single space, lower-case
        // the text, and split it into tokens.
        text = text.replaceAll("\\W+", " ").toLowerCase();
        Vector<String> tokens = new Vector<>(Arrays.asList(text.split("\\s+")));
        // A leading non-word character produces an empty first token: drop it.
        tokens.remove("");
        return tokens;
    }
}
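/*
 * Minimal usage sketch, assuming the utilities above are on the classpath. The
 * class name SmaphUtilsExample and the sample strings are illustrative only and
 * are not part of the original file.
 */
class SmaphUtilsExample {
    public static void main(String[] args) {
        // Average word-by-word normalized edit distance between a query and a
        // bold: 0 means every word of the bold also appears in the query.
        System.out.println(SmaphUtils.getMinEditDist("armstrong moon landing",
                "neil armstrong"));

        // Tokenization lower-cases the text and splits on non-word characters.
        System.out.println(SmaphUtils.tokenize("Neil Armstrong (astronaut)"));

        // Titles in service namespaces are rejected; regular titles pass.
        System.out.println(SmaphUtils.acceptWikipediaTitle("Talk:Moon")); // false
        System.out.println(SmaphUtils.acceptWikipediaTitle("Moon"));      // true
    }
}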