package com.cognitionis.nlp_lang_models; import java.io.*; import java.util.*; import com.cognitionis.utils_basickit.*; import static com.cognitionis.utils_basickit.FileUtils.URL_exists; /** * * @author Héctor Llorens * @since 2011 * * This is an implementation of the famous Tenkle Text Categorization algorithm * based on character n-grams * Best known as TextCat * */ public class TextCategorizer { private final static int MIN_WORDS_4_CATEGORIZE = 5; private final static String DEFAULT_CATEGORY = "en"; // English private String conf_file_path = "/resources/lang_models/text_categorization/"; private String conf_file_name = "indoeuropean.conf"; //private String conf_file_path = "indoeuropean.conf"; private ArrayList<TextCategorizerFingerprint> categories = new ArrayList(); public TextCategorizer() { loadFingerprints(); } public TextCategorizer(String conf_file_path) { this.conf_file_path = conf_file_path; loadFingerprints(); } private void loadFingerprints() { this.categories.clear(); try { // For our beloved Windows String extra = ""; // TODO check if this is really needed if (File.separator.equals("\\")) { extra = "\\"; } String app_path = FileUtils.getApplicationPath(TextCategorizer.class); if (!URL_exists(app_path+conf_file_path)) { // Check for external resoucre (outside classes) if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) { System.out.println("look outside classes"); } app_path=app_path.replaceAll(extra + File.separator + "classes", ""); // see if we need \\ for windows } try (BufferedReader reader = new BufferedReader(new FileReader(new File(app_path+this.conf_file_path+this.conf_file_name)))) { String line; while ((line = reader.readLine()) != null) { String[] line_arr = line.split("\\s+"); if(line_arr.length > 0){ if(line_arr.length != 2){ throw new Exception("Malformed TextCategorizer configuration file.\n\tMust contain one fingerprint file path per line."); } TextCategorizerFingerprint fp=new TextCategorizerFingerprint(app_path+this.conf_file_path+line_arr[0],line_arr[1]); categories.add(fp); } } } } catch (Exception e) { System.err.println("Errors found ("+this.getClass().getSimpleName()+"):\n\t" + e.toString() + "\n"); if(System.getProperty("DEBUG")!=null && System.getProperty("DEBUG").equalsIgnoreCase("true")){e.printStackTrace(System.err);} } } /** * categorizes only a certain amount of characters in the text. recommended * when categorizing large texts in order to increase performance. * * @param text text to be analyzed * @param limit number of characters to be analyzed * @return the category name given in the configuration file */ public String categorize(String text, int limit) { if(limit > (text.length()-1)) { limit=text.length()-1; } return this.categorize(text.substring(0,limit)); } public String categorize(String text) { if(text.length() < MIN_WORDS_4_CATEGORIZE) { return DEFAULT_CATEGORY; } TextCategorizerFingerprint fp = new TextCategorizerFingerprint(); fp.create(text.toLowerCase()); fp.categorize(categories); return fp.getCategory(); } }