package yuku.alkitabconverter.internal_common;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import yuku.alkitabconverter.util.Rec;
import yuku.alkitabconverter.util.TextDb;
import yuku.bintex.BintexWriter;
public class ReverseIndexer {
public static final String TAG = ReverseIndexer.class.getSimpleName();
public final static Charset ascii = Charset.forName("ascii");
public final static Charset utf8 = Charset.forName("utf8");
public static void createReverseIndex(File outDir, String prefix, TextDb teksDb) {
Pattern p_word = Pattern.compile("[A-Za-z]+(?:[-'][A-Za-z]+)*");
Map<String, Set<Integer>> map = new TreeMap<>(new Comparator<String>() {
@Override public int compare(String o1, String o2) {
int lenc = o1.length() - o2.length();
if (lenc == 0) {
return o1.compareTo(o2);
} else {
return lenc;
}
}
});
{
int lid = 0;
for (Rec rec: teksDb.toRecList()) {
lid++;
String text = Normalizer.normalize(rec.text, Normalizer.Form.NFD).replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
text = text.toLowerCase();
Matcher m = p_word.matcher(text);
while (m.find()) {
String word = m.group();
Set<Integer> locations = map.get(word);
if (locations == null) {
locations = new TreeSet<>();
map.put(word, locations);
}
locations.add(lid);
}
}
System.out.println("Last lid = " + lid);
}
int maxwordlen = 0;
for (Map.Entry<String, Set<Integer>> e: map.entrySet()) {
String word = e.getKey();
System.out.println("word " + word + " lids=" + e.getValue());
if (word.length() > maxwordlen) maxwordlen = word.length();
}
System.out.println("Number of words: " + map.size());
System.out.println("Longest word: " + maxwordlen);
int stat_lid_absolute = 0;
int stat_lid_delta = 0;
try {
BintexWriter bw = new BintexWriter(new FileOutputStream(new File(outDir, String.format("%s_revindex_bt.bt", prefix))));
// :: int word_count
bw.writeInt(map.size());
// split based on word length
for (int i = 1; i <= maxwordlen; i++) {
Map<String, Set<Integer>> lenmap = new TreeMap<>();
for (Map.Entry<String, Set<Integer>> e: map.entrySet()) {
String word = e.getKey();
if (i == word.length()) {
lenmap.put(word, e.getValue());
}
}
int cnt = lenmap.size();
System.out.println("Words with length " + i + ": " + cnt);
if (cnt != 0) {
// :: uint8 word_len
// :: int word_by_len_count
bw.writeUint8(i);
bw.writeInt(cnt);
for (Map.Entry<String, Set<Integer>> e: lenmap.entrySet()) {
String word = e.getKey();
Set<Integer> lids = e.getValue();
// :: byte[word_len] word
// :: uint16 lid_count
bw.writeRaw(word.getBytes(ascii));
bw.writeUint16(lids.size());
int last_lid = 0;
for (int lid: lids) {
int delta = lid - last_lid;
if (delta <= 0x7f) {
bw.writeUint8(delta);
stat_lid_delta++;
} else {
bw.writeChar((char) (0x8000 | lid));
stat_lid_absolute++;
}
last_lid = lid;
}
}
}
}
bw.close();
System.out.println("Lid written using delta = " + stat_lid_delta);
System.out.println("Lid written using absolute = " + stat_lid_absolute);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
}