package com.statusParser;
/**
* Word based shingle algorithm
*
* @author sadv1r
*
* @version 0.8
*
*
*/
public class Shingle {
private static final String STOP_SYMBOLS[] = {".",",","!","?",":",";","-","\\","/","*","(",")"};
private static final String STOP_WORDS[] = {"это", "как", "так", "и", "в", "над", "к", "до", "не", "на", "но", "за", "то", "с", "ли", "а", "во", "от", "со", "для", "о", "же", "ну", "вы", "бы", "что", "кто", "он", "она"};
private static final int SHINGLE_LEN = 2;
private String canonize(String str) {
for (String stopSymbol : STOP_SYMBOLS) {
str = str.replace(stopSymbol, "");
}
for (String stopWord : STOP_WORDS) {
str = str.replace(" " + stopWord + " ", " ");
}
return str;
}
public String genShingle(String strNew) {
String str = canonize(strNew.toLowerCase());
String words[] = str.split(" ");
int shinglesNumber = words.length - SHINGLE_LEN;
String shingles = "";
//Create all shingles
for (int i = 0; i <= shinglesNumber; i++) {
String shingle = "";
//Create one shingle
for (int j = 0; j < SHINGLE_LEN; j++) {
shingle = shingle + words[i+j] + " ";
}
shingles = shingles + shingle.hashCode() + ";";
}
return shingles;
}
public int compare(String textShingles1New, String textShingles2New) {
//textShingles1New and textShingles2New equals "" or null bug fix
if (textShingles1New.equals("") || textShingles2New.equals("") ||
textShingles1New.equals(" ") || textShingles2New.equals(" ") ||
textShingles1New == null || textShingles2New == null) return 0;
String textShingles1[] = textShingles1New.split(";");
String textShingles2[] = textShingles2New.split(";");
int textShingles1Number = textShingles1.length;
int textShingles2Number = textShingles2.length;
int textShingles1Int[] = new int[textShingles1Number];
int textShingles2Int[] = new int[textShingles2Number];
for (int i=0; i<textShingles1Number;i++) {
textShingles1Int[i] = Integer.parseInt(textShingles1[i]);
}
for (int i=0; i<textShingles2Number;i++) {
textShingles2Int[i] = Integer.parseInt(textShingles2[i]);
}
double similarShinglesNumber = 0;
for (int i=0;i<textShingles1Number;i++) {
for (int j=0;j<textShingles2Number;j++) {
if (textShingles1Int[i] == textShingles2Int[j]) similarShinglesNumber++;
}
}
return (int) ((similarShinglesNumber / ((textShingles1Number + textShingles2Number) / 2.0)) * 100);
}
}