package org.basex.util.ft; import org.basex.core.Prop; import org.basex.util.Reflect; import static org.basex.util.Token.*; import org.basex.util.TokenBuilder; import org.basex.util.Util; import static org.basex.util.ft.FTFlag.*; import java.io.File; import java.lang.reflect.Constructor; import java.lang.reflect.Field; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; /** * Japanese lexer using igo (http://igo.sourceforge.jp/). * * @author BaseX Team 2005-12, BSD License * @author Toshio HIRAI */ public class JapaneseTokenizer extends Tokenizer { /** Flag available. */ private static boolean available = true; /** Name of the Igo tagger class. */ private static final String PATTERN = "net.reduls.igo.Tagger"; /** Name of Japanese dictionary. */ private static final String LANG = "ja"; /** The kind of POS(Noun). */ private static final String MEISHI = "\u540D\u8A5E"; /** The kind of POS(Pre-noun Adjectival). */ private static final String RENTAISHI = "\u9023\u4F53\u8A5E"; /** The kind of POS(Adverb). */ private static final String HUKUSHI = "\u526F\u8A5E"; /** The kind of POS(Verb). */ private static final String DOUSHI = "\u52D5\u8A5E"; /** The kind of POS(Conjunction). */ private static final String SETSUZOKUSHI = "\u63A5\u982D\u8A5E"; /** The kind of POS(Modal verbs). */ private static final String JYODOUSHI = "\u52A9\u52D5\u8A5E"; /** The kind of POS(Postpositional particle). */ private static final String JYOSHI = "\u52A9\u8A5E"; /** The kind of POS(Adjective). */ private static final String KEIYOUSHI = "\u5F62\u5BB9\u8A5E"; /** The kind of POS(Mark). */ private static final String KIGOU = "\u8A18\u53F7"; /** The kind of POS(Interjection). */ private static final String KANDOUSHI = "\u8A18\u53F7"; /** The kind of POS(Filler). */ private static final String FILLER = "\u30D5\u30A3\u30E9\u30FC"; /** Constant of Feature(Mark). */ private static final String KIGOU_FEATURE = "\u8A18\u53F7,*,*,*,*,*,*,*,*"; /** Constant of Feature(Noun). */ private static final String MEISHI_FEATURE = "\u540D\u8A5E,*,*,*,*,*,*,*,*"; /** Igo instance. */ private static Object tagger; /** Parse method. */ private static Method parse; /** Surface field. */ private static Field surface; /** feature field. */ private static Field feature; /** Start field. */ private static Field start; /** Token iterator. */ private Iterator<Morpheme> tokens; /** Token list. */ private ArrayList<Morpheme> tokenList = new ArrayList<Morpheme>(); /** Current position of the Token list. */ private int cpos; /** Current token. */ private Morpheme currToken; /** Diacritics flag. */ private final boolean dc; /** Sensitivity flag. */ private final boolean cs; /** Uppercase flag. */ private final boolean uc; /** Lowercase flag. */ private final boolean lc; /** Wildcard flag. */ private final boolean wc; /** Stemming flag. */ private final boolean st; /** Token position. */ private int pos = -1; /** Flag indicating a special character. */ private boolean sc; static { File dic = null; if(!Reflect.available(PATTERN)) { available = false; } else { dic = new File(LANG); if(!dic.exists()) { dic = new File(Prop.HOME, "etc/" + LANG); if(!dic.exists()) { available = false; } } } if(available) { Class<?> clz = Reflect.find(PATTERN); if(clz == null) { Util.debug("Could not initialize Igo Japanese lexer."); } else { /* Igo constructor. */ final Constructor<?> tgr = Reflect.find(clz, String.class); tagger = Reflect.get(tgr, dic.toString()); if(tagger == null) { available = false; Util.debug("Could not initialize Igo Japanese lexer."); } else { parse = Reflect.method(clz, "parse", CharSequence.class); if(parse == null) { Util.debug("Could not initialize Igo lexer method."); } clz = Reflect.find("net.reduls.igo.Morpheme"); surface = Reflect.field(clz, "surface"); feature = Reflect.field(clz, "feature"); start = Reflect.field(clz, "start"); } } } } /** * Checks if the library is available. * @return result of check */ static boolean available() { return available; } /** * Constructor. * @param fto (optional) full-text options */ public JapaneseTokenizer(final FTOpt fto) { lc = fto != null && fto.is(LC); uc = fto != null && fto.is(UC); cs = fto != null && fto.is(CS); wc = fto != null && fto.is(WC); dc = fto != null && fto.is(DC); st = fto != null && fto.is(ST); } @Override Tokenizer get(final FTOpt f) { return new JapaneseTokenizer(f); } @Override public JapaneseTokenizer init(final byte[] txt) { String source = string(txt); if(wc) { // convert wide-space to space source = source.replace('\u3000', '\u0020'); } final ArrayList<?> morpheme = (ArrayList<?>) Reflect.invoke(parse, tagger, source); final ArrayList<Morpheme> list = new ArrayList<Morpheme>(); try { int prev = 0; for(int i = 0; i < morpheme.size(); i++) { final Object m = morpheme.get(i); final String srfc = surface.get(m).toString(); final String ftr = feature.get(m).toString(); final int s = start.getInt(m); if(i != 0) { final int l = s - prev; if(l != 0) { list.add(new Morpheme( source.substring(s - 1, s + l - 1), KIGOU_FEATURE) ); } } prev = srfc.length() + s; // separates continuous mark (ASCII) boolean cont = true; final ArrayList<Morpheme> marks = new ArrayList<Morpheme>(); for(int j = 0; j < srfc.length(); j++) { final String c = String.valueOf(srfc.charAt(j)); final byte[] t = token(c); if(t.length == 1) if(letter(t[0]) || digit(t[0])) cont = false; else marks.add(new Morpheme(c, KIGOU_FEATURE)); else cont = false; } if(cont) list.addAll(marks); else list.add(new Morpheme(srfc, ftr)); } } catch(final Exception ex) { Util.errln(Util.name(this) + ": " + ex); } tokenList = list; tokens = list.iterator(); return this; } /** * Returns whether the special character. * @param s string * @return result of check */ private static boolean isFtChar(final String s) { return s.equals(".") || s.equals("?") || s.equals("*") || s.equals("+") || s.equals("\\") || s.equals("{") || s.equals("}"); } /** * Returns whether the following token exists (using wildcards). * @return result of check */ private boolean moreWC() { final StringBuilder word = new StringBuilder(); final int size = tokenList.size(); boolean period = false; boolean bs = false; boolean more = false; for(; cpos < size; cpos++) { String cSrfc = tokenList.get(cpos).getSurface(); final boolean cMark = tokenList.get(cpos).isMark(); String nSrfc = null; boolean nMark = false; if(cpos < size - 1) { nSrfc = tokenList.get(cpos + 1).getSurface(); nMark = tokenList.get(cpos + 1).isMark(); } if(nSrfc != null) { if(cSrfc.equals("\\")) bs = true; // delimiter if(cMark && !isFtChar(cSrfc) || cSrfc.equals("\\") && nMark) { period = false; bs = false; if(word.length() != 0) { more = true; break; } if(cSrfc.equals("\\") && nMark) cpos++; continue; } word.append(cSrfc); if(bs || nSrfc.equals("\\")) { more = true; continue; } if(cSrfc.equals(".") || nSrfc.equals(".")) { period = true; continue; } if(period) { if(cSrfc.equals("{")) { cpos++; for(; cpos < size; cpos++) { cSrfc = tokenList.get(cpos).getSurface(); word.append(cSrfc); if(cSrfc.equals("}")) { more = true; break; } } cpos++; break; } continue; } } else { // last token. if(cMark) { if(cSrfc.equals("\\")) continue; if(word.length() != 0) { word.append(cSrfc); } more = true; continue; } } if(period) { word.append(cSrfc); } else { if(bs) if(!isFtChar(cSrfc)) word.append(cSrfc); else word.setLength(0); } more = true; cpos++; break; } if(more) { currToken = word.length() != 0 ? new Morpheme(word.toString(), MEISHI_FEATURE) : tokenList.get(cpos - 1); } return more; } /** * Returns whether the following token exists. * @return result */ private boolean more() { if(special) { return tokens.hasNext(); } while(tokens.hasNext()) { currToken = tokens.next(); if(!currToken.isMark() && !currToken.isAttachedWord()) return true; } return false; } @Override public boolean hasNext() { return wc ? moreWC() : more(); } @Override public FTSpan next() { return new FTSpan(nextToken(), pos, sc); } /** * Returns the effective token. * @return token */ private byte[] get() { pos++; String n = currToken.getSurface(); final int hinshi = currToken.getHinshi(); if(st && (hinshi == Morpheme.HINSHI_DOUSHI || hinshi == Morpheme.HINSHI_KEIYOUSHI)) { n = currToken.getBaseForm(); } byte[] token = token(n); final boolean a = ascii(token); if(!a && !dc) token = WesternTokenizer.dia(token); if(uc) token = WesternTokenizer.upper(token, a); if(lc || !cs) token = WesternTokenizer.lower(token, a); return toHankaku(token); } /** * Returns the token which contains special character. * @return token */ private byte[] getSC() { final Morpheme m = tokens.next(); final String n = m.getSurface(); if(m.isMark() || m.isAttachedWord()) sc = true; else { pos++; sc = false; } return token(n); } @Override public byte[] nextToken() { return special ? getSC() : get(); } @Override protected byte prec() { return 20; } @Override Collection<Language> languages() { return collection(LANG); } /** * Converts to HANKAKU characters. * @param s Japanese text * @return result of conversion(->HANKAKU) */ private static byte[] toHankaku(final byte[] s) { if(ascii(s)) return s; final TokenBuilder tb = new TokenBuilder(s.length); for(int p = 0; p < s.length; p += cl(s, p)) { final int c = cp(s, p); if(c >= 0xFF10 && c <= 0xFF19 || c >= 0xFF21 && c <= 0xFF3A || c >= 0xFF41 && c <= 0xFF5A) { tb.add(c - 0xFEE0); } else if(c == 0x3000) { // IDEOGRAPHIC SPACE tb.add(0x0020); } else if(c == 0xFF01) { // ! tb.add(0x0021); } else if(c == 0x201D) { // " tb.add(0x0022); } else if(c == 0xFF03) { // # tb.add(0x0023); } else if(c == 0xFF04) { // $ tb.add(0x0024); } else if(c == 0xFF05) { // % tb.add(0x0025); } else if(c == 0xFF06) { // & tb.add(0x0026); } else if(c == 0x2019) { // ' tb.add(0x0027); } else if(c == 0xFF08) { // ( tb.add(0x0028); } else if(c == 0xFF09) { // ) tb.add(0x0029); } else if(c == 0xFF0A) { // * tb.add(0x002A); } else if(c == 0xFF0B) { // + tb.add(0x002B); } else if(c == 0xFF0C) { // , tb.add(0x002C); } else if(c == 0xFF0D) { // - tb.add(0x002D); } else if(c == 0xFF0E) { // . tb.add(0x002E); } else if(c == 0xFF0F) { // / tb.add(0x002F); } else if(c == 0xFF1A) { // : tb.add(0x003A); } else if(c == 0xFF1B) { // ; tb.add(0x003B); } else if(c == 0xFF1C) { // < tb.add(0x003C); } else if(c == 0xFF1D) { // = tb.add(0x003D); } else if(c == 0xFF1E) { // > tb.add(0x003E); } else if(c == 0xFF1F) { // ? tb.add(0x003F); } else if(c == 0xFF20) { // @ tb.add(0x0040); } else if(c == 0xFF3B) { // [ tb.add(0x005B); } else if(c == 0xFFE5) { // \ tb.add(0x005C); } else if(c == 0xFF3D) { // ] tb.add(0x005D); } else if(c == 0xFF3E) { // ^ tb.add(0x005E); } else if(c == 0xFF3F) { // _ tb.add(0x005F); } else if(c == 0xFF40) { // ` tb.add(0x0060); } else if(c == 0xFF5B) { // { tb.add(0x007B); } else if(c == 0xFF5C) { // | tb.add(0x007C); } else if(c == 0xFF5D) { // } tb.add(0x007D); } else if(c == 0xFF5E) { // ~ tb.add(0x007E); } else { tb.add(c); } } return tb.finish(); } /** Morpheme class. */ static class Morpheme { /** A part of speech in the context, NEISHI(Noun). */ static final int HINSHI_MEISHI = 1; /** A part of speech in the context, RENTAISHI(Pre-noun Adjectival). */ static final int HINSHI_RENTAISHI = 2; /** A part of speech in the context, HUKUSHI(Adverb). */ static final int HINSHI_HUKUSHI = 3; /** A part of speech in the context, DOUSHI(Verb). */ static final int HINSHI_DOUSHI = 4; /** A part of speech in the context, SETSUZOKUSHI(Conjunction). */ static final int HINSHI_SETSUZOKUSHI = 5; /** A part of speech in the context, JYODOUSHI(Modal verbs). */ static final int HINSHI_JYODOUSHI = 6; /** A part of speech in the context, JYOSHI(Postpositional particle). */ static final int HINSHI_JYOSHI = 7; /** A part of speech in the context, KEIYOUSHI(Adjective). */ static final int HINSHI_KEIYOUSHI = 8; /** A part of speech in the context, KIGOU(Mark). */ static final int HINSHI_KIGOU = 9; /** A part of speech in the context, KANDOUSHI(Interjection). */ static final int HINSHI_KANDOUSHI = 10; /** A part of speech in the context, FILLER(Filler). */ static final int HINSHI_FILLER = 11; /** A part of speech in the context, Others. */ static final int HINSHI_SONOTA = 0; /** Surface of Morpheme. */ private final String mSurface; /** Feature of Morpheme. */ private final String mFeature; /** * Constructor. * @param srfc surface * @param ftr feature */ Morpheme(final String srfc, final String ftr) { mSurface = srfc; mFeature = ftr; } /** * Returns surface. * @return Surface */ public String getSurface() { return mSurface; } /** * Returns whether the avoid token. * @return result */ public boolean isMark() { final int hinshi = getHinshi(); return hinshi == HINSHI_KIGOU || hinshi == HINSHI_FILLER; } /** * Tests an attached word(FUZOKU-GO). * @return result */ public boolean isAttachedWord() { final int hinshi = getHinshi(); return hinshi == HINSHI_JYODOUSHI || hinshi == HINSHI_JYOSHI; } /** * Returns the part of speech. * @return part of speech */ public int getHinshi() { final int hinshi; // morphological analyzer certainly returns // the single ascii char as a "noun". final byte[] s = token(mSurface); if(s.length == 1 && !letter(s[0]) && !digit(s[0])) { hinshi = HINSHI_KIGOU; } else { final String h = getPos(); if(h.equals(MEISHI)) { hinshi = HINSHI_MEISHI; } else if(h.equals(RENTAISHI)) { hinshi = HINSHI_RENTAISHI; } else if(h.equals(HUKUSHI)) { hinshi = HINSHI_HUKUSHI; } else if(h.equals(DOUSHI)) { hinshi = HINSHI_DOUSHI; } else if(h.equals(SETSUZOKUSHI)) { hinshi = HINSHI_SETSUZOKUSHI; } else if(h.equals(JYODOUSHI)) { hinshi = HINSHI_JYODOUSHI; } else if(h.equals(JYOSHI)) { hinshi = HINSHI_JYOSHI; } else if(h.equals(KEIYOUSHI)) { hinshi = HINSHI_KEIYOUSHI; } else if(h.equals(KIGOU)) { hinshi = HINSHI_KIGOU; } else if(h.equals(KANDOUSHI)) { hinshi = HINSHI_KANDOUSHI; } else if(h.equals(FILLER)) { hinshi = HINSHI_FILLER; } else { hinshi = HINSHI_SONOTA; } } return hinshi; } /** * Retrieves base form from feature. * @return base form */ public String getBaseForm() { final String[] parts = mFeature.split(","); return parts[6]; } /** * Retrieves parts of speech from feature. * @return parts of speech(coding in Japanese) */ private String getPos() { final String[] parts = mFeature.split(","); return parts[0]; } @Override public String toString() { return mSurface; } } }