package com.gmail.dpierron.calibre.opds.indexer; import com.gmail.dpierron.calibre.configuration.ConfigurationManager; import com.gmail.dpierron.calibre.datamodel.Author; import com.gmail.dpierron.calibre.datamodel.Book; import com.gmail.dpierron.calibre.datamodel.DataModel; import com.gmail.dpierron.calibre.datamodel.Tag; import com.gmail.dpierron.tools.Helper; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.*; import java.rmi.dgc.VMID; import java.text.SimpleDateFormat; import java.util.*; /** * The index of all the catalog items (books, authors, series, etc.) * composing the catalog, with the keywords to search them full-text */ public class Index { // TODO Make this configurabel? private static final int MIN_KEYWORD_SIZE = 3; private final static Logger logger = LogManager.getLogger(Index.class); // private static Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+"); // TODO Make this configurable? // TODO Build up list further, // TODO make a language dependent private final static Collection<String> keywordsToIgnore = Arrays.asList("and", "the", "not"); Map<String, Keyword> mapOfKeywords; /** * */ public Index() { super(); mapOfKeywords = new TreeMap<String, Keyword>(); } /** * * @param toCopy */ private Index(Index toCopy) { this(); mapOfKeywords = new TreeMap<String, Keyword>(); if (toCopy.mapOfKeywords != null) mapOfKeywords.putAll(toCopy.mapOfKeywords); } /** * * @return */ public long size() { if (mapOfKeywords == null) return 0; return mapOfKeywords.size(); } /** * * @param keyword * @return */ public static String prepareKeywordForIndexing(String keyword) { if (keyword == null) return null; String result = keyword; // all the keywords are in lower case result = result.toLowerCase(Locale.ENGLISH); // String temp = Normalizer.normalize(result, Normalizer.Form.NFD); // result = pattern.matcher(temp).replaceAll(""); // trim the keywords result = result.trim(); // no space in the keyword result = result.replace(" ", ""); return result; } /** * split the given string into keywords for search indexing purposes * * TODO: ITIMPI: I think this algorithm could be revisitied to improve it * * @param text * @param pTags * @return */ private List<String> splitStringIntoKeywords(String text, boolean pTags) { List<String> result = new ArrayList<String>(); String splitTagsOn = ConfigurationManager.getCurrentProfile().getDontSplitTagsOn() ? "" : ConfigurationManager.getCurrentProfile().getSplitTagsOn(); boolean processingTags = pTags && Helper.isNotNullOrEmpty(splitTagsOn); char tagChar = ' '; if (processingTags) tagChar = splitTagsOn.charAt(0); if (Helper.isNullOrEmpty(text)) { return result; } // TODO Look at whether tokenizing string would be more efficient? // TODO For tags could use SplitOnTags character StringBuffer currentKeyword = new StringBuffer(); for (char c : text.toCharArray()) { if (Character.isLetter(c) || (processingTags && (c == tagChar))) currentKeyword.append(c); else { if (currentKeyword.length() >= MIN_KEYWORD_SIZE) { String lowerCurrentKeyword = currentKeyword.toString().toLowerCase(); if (! keywordsToIgnore.contains(lowerCurrentKeyword)) { result.add(lowerCurrentKeyword); } } currentKeyword = new StringBuffer(); } } result.add(currentKeyword.toString()); return result; } /** * * @param pKeyword * @param type * @param bookEntry */ public void addItem(String pKeyword, ItemType type, BookEntry bookEntry) { Keyword keyword; String word = prepareKeywordForIndexing(pKeyword); if (Helper.isNullOrEmpty(word)) return; if (mapOfKeywords == null) { mapOfKeywords = new TreeMap<String, Keyword>(); keyword = new Keyword(1, word); keyword.addCatalogItem(type, bookEntry); } else { keyword = mapOfKeywords.get(word); if (keyword == null) { keyword = new Keyword(mapOfKeywords.size(), word); keyword.addCatalogItem(type, bookEntry); mapOfKeywords.put(word, keyword); } else { keyword.addCatalogItem(type, bookEntry); } } } /** * @param text * @param type * @param bookEntry * @param tags */ private void indexMultipleKeywords(String text, ItemType type, BookEntry bookEntry, boolean tags) { List<String> keywords = splitStringIntoKeywords(text, tags); for (String keyword : keywords) { addItem(keyword, type, bookEntry); } } /** * * @param book * @param url * @param thumbnailUrl */ public void indexBook(Book book, String url, String thumbnailUrl) { if (logger.isTraceEnabled()) logger.trace("indexBook: book=" + book + ", url=" + url + ", thumbnailUrl=" + thumbnailUrl); if (book == null) return; BookEntry bookEntry = new BookEntry(book, url, thumbnailUrl); // parse the book title indexMultipleKeywords(book.getTitle(), ItemType.BookTitle, bookEntry, false); // parse the book comments if (ConfigurationManager.getCurrentProfile().getIndexComments()) indexMultipleKeywords(Helper.removeHtmlElements(book.getComment()), ItemType.BookComment, bookEntry, false); // parse the book series if (book.getSeries() != null) indexMultipleKeywords(book.getSeries().getName(), ItemType.Series, bookEntry, false); // parse the book authors for (Author author : book.getAuthors()) { indexMultipleKeywords(author.getName(), ItemType.Author, bookEntry, false); } // parse the book tags for (Tag tag : book.getTags()) { indexMultipleKeywords(tag.getName(), ItemType.Tag, bookEntry, true); } } /** * * @param text * @return */ private String parseForApostrophes(String text) { return text.replace("'", "\\'"); } /** * * @param text * @return */ private String parseForFrenchQuotes(String text) { return text.replace("\"", "\\\""); } /** * */ public enum FilterHintType { RemoveRare, RemoveCommon, RemoveMedian; } /** * filter the current index, by making a smaller copy of it (reducing the number of keywords) * * @param maxKeywords the maximum number of keywords in the filtered index * @param filterHint refines the selection algorithm * @return a filtered copy of the current index */ public Index filterIndex(long maxKeywords, FilterHintType filterHint) { Index result = new Index(this); // we may already be small enough ! if (size() <= maxKeywords || maxKeywords == -1) return result; // don't filter less than 10 items if (size() <= 10) return result; // sort the keywords in a list, by number of uses List<Keyword> keywords = new ArrayList<Keyword>(mapOfKeywords.size()); keywords.addAll(mapOfKeywords.values()); Collections.sort(keywords, new Comparator<Keyword>() { public int compare(Keyword o1, Keyword o2) { int o1Size = o1.catalogItems.size(); int o2Size = o2.catalogItems.size(); return o1Size - o2Size; } }); // compute how much keywords we must remove long nbKeywordsToRemove = size() - maxKeywords; // parse the keywords, removing the less desired int startingPosition = 0; int position = startingPosition; if (filterHint == FilterHintType.RemoveCommon) { position = startingPosition = keywords.size() - 1; } else if (filterHint == FilterHintType.RemoveMedian) { startingPosition = keywords.size() / 2; position = startingPosition - 1; } while ((nbKeywordsToRemove > 0) && (position >= 0) && (position < keywords.size())) { // remove the current keyword Keyword keyword = keywords.get(position); result.mapOfKeywords.remove(keyword.word); // for the "remove median" case, remove the one to the same distance, on the opposite side of the middle if (filterHint == FilterHintType.RemoveMedian) { int oppositePosition = startingPosition + (startingPosition - position); keyword = keywords.get(oppositePosition); result.mapOfKeywords.remove(keyword.word); nbKeywordsToRemove = nbKeywordsToRemove - 2; position--; } else { nbKeywordsToRemove--; if (filterHint == FilterHintType.RemoveRare) position++; else position--; } } return result; } /* public void exportToJavascript(File exportFolder) throws FileNotFoundException { List<String> sqlKeywords = new ArrayList<String>(mapOfKeywords.size()); List<String> sqlBooks = new ArrayList<String>(mapOfKeywords.size()); List<String> sqlCatalogItems = new ArrayList<String>(mapOfKeywords.size()); // clear the flag of all books for (Book book : DataModel.getListOfBooks()) { book.clearFlagged(); } for (Keyword keyword : mapOfKeywords.values()) { String kwId = Long.toString(keyword.id); { // add a line in the KEYWORDS table String kwWord = keyword.word; // no need to search for apostrophes, the keywords are already cleaned-up and uppercase String kwWeight = "" + keyword.catalogItems.size(); String sql = "tx.executeSql('INSERT INTO KEYWORDS (KW_ID, KW_WORD, KW_WEIGHT) VALUES (?, ?, ?)', ['" + kwId + "', '" + kwWord + "', '" + kwWeight + "']);"; sqlKeywords.add(sql); } for (Map.Entry<ItemType, CatalogItem> catalogItemEntry : keyword.catalogItems.entrySet()) { for (BookEntry bookEntry : catalogItemEntry.getValue().bookEntries) { String bkId = bookEntry.book.getId(); int bookId = Integer.parseInt(bkId); if (!bookEntry.book.isFlagged()) { bookEntry.book.setFlagged(); { // add a line in the BOOKS table String bkTitle = parseForApostrophes(bookEntry.book.getTitle()); String bkUrl = bookEntry.url; String bkCoverUrl = bookEntry.thumbnailUrl; String sql = "tx.executeSql('INSERT INTO BOOKS (BK_ID, BK_TITLE, BK_URL, BK_COVER_URL) VALUES (?, ?, ?, ?)', ['" + bkId + "', '" + bkTitle + "', '" + bkUrl + "','" + bkCoverUrl + "']);"; sqlBooks.add(sql); } } // add a line in the CATALOG_ITEMS table { String catType = catalogItemEntry.getKey().getCode(); String sql = "tx.executeSql('INSERT INTO CATALOG_ITEMS (KW_ID, BK_ID, CAT_TYPE) VALUES (?, ?, ?)', ['" + kwId + "', '" + bkId + "', '" + catType + "']);"; sqlCatalogItems.add(sql); } } } } // output the SQL File outputFile = new File(exportFolder, "database.js"); FileOutputStream fos = null; PrintWriter pw = null; try { fos = new FileOutputStream(outputFile); pw = new PrintWriter(fos); pw.println("// inserting Books"); pw.println("console.log('inserting Books ');"); Collections.sort(sqlBooks); for (String sql : sqlBooks) { pw.println(sql); } pw.println("// end of Books"); pw.println("console.log('finished inserting Books ');"); pw.println("// inserting Keywords"); pw.println("console.log('inserting Keywords ');"); Collections.sort(sqlKeywords); for (String sql : sqlKeywords) { pw.println(sql); } pw.println("// end of Keywords"); pw.println("console.log('finished inserting Keywords ');"); pw.println("// inserting CatalogItems"); pw.println("console.log('inserting CatalogItems ');"); Collections.sort(sqlCatalogItems); for (String sql : sqlCatalogItems) { pw.println(sql); } pw.println("// end of CatalogItems"); pw.println("console.log('finished inserting CatalogItems ');"); } finally { if (pw != null) pw.close(); } } */ /* private void writeJsonArray(String name, List<String> strings, PrintWriter pw) { pw.print("["); Iterator<String> iterator = strings.iterator(); while (iterator.hasNext()) { String string = iterator.next(); pw.print("'" + string + "'"); if (iterator.hasNext()) pw.print(","); } pw.print("]"); } */ /* private void writeJson(File exportFolder, String name, List<String> jsonData, List<String> jsonKeys) throws IOException { File outputFile = new File(exportFolder, name + ".json"); FileOutputStream fos = null; PrintWriter pw = null; try { fos = new FileOutputStream(outputFile); pw = new PrintWriter(fos); pw.println("{"); writeJsonArray(name, jsonData, pw); pw.println(","); writeJsonArray("keys", jsonKeys, pw); pw.println("}"); } finally { if (pw != null) pw.close(); } } */ /* public void exportToJSON(File exportFolder) throws IOException { List<String> jsonKeywords = new ArrayList<String>(mapOfKeywords.size()); List<String> jsonBooks = new ArrayList<String>(mapOfKeywords.size()); List<String> jsonCatalogItems = new ArrayList<String>(mapOfKeywords.size()); // clear the flag of all books for (Book book : DataModel.getListOfBooks()) { book.clearFlagged(); } for (Keyword keyword : mapOfKeywords.values()) { String kwId = Long.toString(keyword.id); { // add a line in the KEYWORDS table String kwWord = keyword.word; // no need to search for apostrophes, the keywords are already cleaned-up and uppercase String kwWeight = "" + keyword.catalogItems.size(); jsonKeywords.add(kwId); jsonKeywords.add(kwWord); jsonKeywords.add(kwWeight); } for (Map.Entry<ItemType, CatalogItem> catalogItemEntry : keyword.catalogItems.entrySet()) { for (BookEntry bookEntry : catalogItemEntry.getValue().bookEntries) { String bkId = bookEntry.book.getId(); int bookId = Integer.parseInt(bkId); if (!bookEntry.book.isFlagged()) { bookEntry.book.setFlagged(); { // add a line in the BOOKS table String bkTitle = parseForApostrophes(bookEntry.book.getTitle()); String bkUrl = bookEntry.url; String bkCoverUrl = bookEntry.thumbnailUrl; jsonBooks.add(bkId); jsonBooks.add(bkTitle); jsonBooks.add(bkUrl); jsonBooks.add(bkCoverUrl); } } // add a line in the CATALOG_ITEMS table { String catType = catalogItemEntry.getKey().getCode(); jsonCatalogItems.add(kwId); jsonCatalogItems.add(bkId); jsonCatalogItems.add(catType); } } } } writeJson(exportFolder, "books", jsonBooks, new ArrayList<String>() {{ add("bkId"); add("bkTitle"); add("bkUrl"); add("bkCoverUrl"); }}); writeJson(exportFolder, "keywords", jsonKeywords, new ArrayList<String>() {{ add("kwId"); add("kwWord"); add("kwWeight"); }}); writeJson(exportFolder, "catalogitems", jsonCatalogItems, new ArrayList<String>() {{ add("kwId"); add("bkId"); add("catType"); }}); } */ /** * Create a Javascript file from the items passed in * * TODO See if rework can write files in-line? * * @param exportFolder * @param name * @param data * @param keys * @throws IOException */ private void writeJavascript(File exportFolder, String name, List<String[]> data, List<String> keys) throws IOException { File outputFile = new File(exportFolder, name + ".js"); FileOutputStream fos = null; PrintWriter pw = null; try { fos = new FileOutputStream(outputFile); pw = new PrintWriter(fos); { pw.println("function get" + Helper.toTitleCase(name) + " () {"); pw.print(" // "); Iterator<String> iterator = keys.iterator(); while (iterator.hasNext()) { String string = iterator.next(); pw.print(string); if (iterator.hasNext()) pw.print(", "); } pw.println(); pw.println(" // " + data.size() + " elements"); pw.println(" return ["); Iterator<String[]> arrayIterator = data.iterator(); while (arrayIterator.hasNext()) { String[] stringArray = arrayIterator.next(); pw.print(" ["); for (int i = 0; i < stringArray.length; i++) { String string = stringArray[i]; pw.print("'" + string + "'"); if (i + 1 < stringArray.length) pw.print(","); } pw.print("]"); if (arrayIterator.hasNext()) pw.print(","); pw.println(); } pw.println(" ];"); pw.println("}"); } } finally { if (pw != null) pw.close(); } } public void exportToJavascriptArrays(File exportFolder) throws IOException { List<String[]> jsKeywords = new ArrayList<String[]>(mapOfKeywords.size()); List<String[]> jsBooks = new ArrayList<String[]>(mapOfKeywords.size()); List<String[]> jsCatalogItems = new ArrayList<String[]>(mapOfKeywords.size()); // clear the flag of all books for (Book book : DataModel.getListOfBooks()) { book.clearFlagged(); } // TODO: See if we can rework to write directly to files to reduce RAM usage for (Keyword keyword : mapOfKeywords.values()) { String kwId = Long.toString(keyword.id); { // add a line in the KEYWORDS table String kwWord = keyword.word; // no need to search for apostrophes, the keywords are already cleaned-up and uppercase String kwWeight = "" + keyword.size(); jsKeywords.add(new String[]{kwId, kwWord, kwWeight}); } for (Map.Entry<ItemType, CatalogItem> catalogItemEntry : keyword.catalogItems.entrySet()) { for (BookEntry bookEntry : catalogItemEntry.getValue().bookEntries) { String bkId = bookEntry.book.getId(); int bookId = Integer.parseInt(bkId); if (!bookEntry.book.isFlagged()) { bookEntry.book.setFlagged(); { // add a line in the BOOKS table String bkTitle = parseForApostrophes(bookEntry.book.getTitle()); String bkUrl = bookEntry.url; String bkThumbnailUrl = bookEntry.thumbnailUrl; jsBooks.add(new String[]{bkId, bkTitle, bkUrl, bkThumbnailUrl}); } } // add a line in the CATALOG_ITEMS table { String catType = catalogItemEntry.getKey().getCode(); jsCatalogItems.add(new String[]{kwId, bkId, catType}); } } } } List<String[]> jsIdentifier = new LinkedList<String[]>(); jsIdentifier.add(new String[]{new VMID().toString(), ConfigurationManager.getCurrentProfile().getCatalogTitle(), SimpleDateFormat.getInstance().format(new Date())}); writeJavascript(exportFolder, "identifier", jsIdentifier, Helper.listThis("id", "label", "date")); writeJavascript(exportFolder, "books", jsBooks, Helper.listThis("bkId", "bkTitle", "bkUrl", "bkThumbnailUrl")); writeJavascript(exportFolder, "keywords", jsKeywords, Helper.listThis("kwId", "kwWord", "kwWeight")); writeJavascript(exportFolder, "catalogitems", jsCatalogItems, Helper.listThis("kwId", "bkId", "catType")); } }