package com.yc.nlp.normal;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.yc.nlp.util.MemFile;

/**
 * Text normalisation helpers: stop-word filtering, sentence splitting,
 * pinyin lookup and a conversion delegated to {@code ZH}.
 *
 * <p>The stop-word set and the pinyin table are loaded once, in the
 * constructor, from {@code stopwords.txt} and {@code pinyin.txt} via
 * {@link MemFile}.
 */
public class Normal {

    private static final Logger logger = LoggerFactory.getLogger(Normal.class);

    /** Splits a document into physical lines (CR or LF). Compiled once instead of per call. */
    private static final Pattern LINE_BREAK = Pattern.compile("[\r\n]");

    /** Splits a line into sentences on the punctuation characters listed in the class. */
    private static final Pattern DELIMITER = Pattern.compile("[,。?!;]");

    private Set<String> stop = new HashSet<String>();

    private Map<String, String> pinyin = new HashMap<String, String>();

    /**
     * Loads the stop-word set and the pinyin table.
     *
     * <p>Loading is best-effort: a failure is logged and the instance keeps
     * whatever was loaded before the failure (initially empty collections).
     * Because both loads share one {@code try}, a failure while loading the
     * stop words also skips loading the pinyin table.
     */
    public Normal() {
        logger.debug("initialize normal begin...");
        try {
            stop = initStop("stopwords.txt");
            pinyin = initPinyin("pinyin.txt");
        } catch (Exception e) {
            // Route the failure through the logger (with its cause) rather than stderr.
            logger.error("initialize normal failed", e);
        }
        logger.debug("initialize normal end...");
    }

    /**
     * Reads the stop-word resource and returns the set built by
     * {@link MemFile#stopFile}.
     *
     * @param stopFile name of the stop-word resource, resolved by {@link MemFile#readFile}
     * @return the stop-word set produced by {@code MemFile.stopFile}
     * @throws Exception if {@code MemFile.readFile} yields no reader, or if
     *         reading or closing the reader fails
     */
    public Set<String> initStop(String stopFile) throws Exception {
        BufferedReader br = MemFile.readFile(stopFile, this);
        if (br == null) {
            throw new Exception("Normal读取" + stopFile + "出错");
        }
        try {
            return MemFile.stopFile(br);
        } finally {
            // NOTE(review): MemFile is not visible from here. Closing is still safe if
            // MemFile already closed the reader, since closing a closed Reader is a no-op.
            br.close();
        }
    }

    /**
     * Reads the pinyin resource and returns the table built by
     * {@link MemFile#pyFile}, which is handed the current {@code pinyin} map.
     *
     * @param pyFile name of the pinyin resource, resolved by {@link MemFile#readFile}
     * @return the map produced by {@code MemFile.pyFile}
     * @throws Exception if {@code MemFile.readFile} yields no reader, or if
     *         reading or closing the reader fails
     */
    public Map<String, String> initPinyin(String pyFile) throws Exception {
        BufferedReader br = MemFile.readFile(pyFile, this);
        if (br == null) {
            throw new Exception("Normal读取" + pyFile + "出错");
        }
        try {
            return MemFile.pyFile(br, pinyin);
        } finally {
            // NOTE(review): see initStop - a second close on a Reader has no effect.
            br.close();
        }
    }

    /**
     * Returns the words that are not in the stop-word set, in their original order.
     *
     * @param words words to filter; {@code null} is treated as an empty list
     * @return a new list containing every element of {@code words} that is not a stop word
     */
    public List<String> filterStop(List<String> words) {
        List<String> filters = new ArrayList<String>();
        if (words == null) {
            return filters;
        }
        for (String word : words) {
            if (!stop.contains(word)) {
                filters.add(word);
            }
        }
        return filters;
    }

    /**
     * Delegates to {@code new ZH().transfer(sent)}.
     *
     * <p>NOTE(review): judging by the method name this presumably converts
     * Chinese text to simplified characters - confirm against {@code ZH}.
     *
     * @param sent text to convert
     * @return whatever {@code ZH.transfer} returns for {@code sent}
     */
    public String zh2hans(String sent) {
        return new ZH().transfer(sent);
    }

    /**
     * Splits a document into sentences.
     *
     * <p>The document is first split on CR/LF, then each non-blank line is
     * split on the delimiter punctuation. Every piece is trimmed and empty
     * pieces are dropped.
     *
     * @param doc the document text; {@code null} is treated as empty
     * @return the trimmed, non-empty sentences in document order
     */
    public List<String> getSentence(String doc) {
        List<String> sentences = new ArrayList<String>();
        if (doc == null) {
            return sentences;
        }
        for (String rawLine : LINE_BREAK.split(doc)) {
            String line = rawLine.trim();
            if (line.length() == 0) {
                continue;
            }
            for (String rawSent : DELIMITER.split(line)) {
                String sent = rawSent.trim();
                if (sent.length() != 0) {
                    sentences.add(sent);
                }
            }
        }
        return sentences;
    }

    /**
     * Looks up the pinyin for a word.
     *
     * <p>If the whole word has an entry in the pinyin table, that entry is
     * returned. Otherwise the word is walked one character at a time and the
     * entries of the characters that are present in the table are
     * concatenated; characters without an entry are skipped.
     *
     * @param word the word to look up; {@code null} yields an empty string
     * @return the pinyin string, or {@code ""} when nothing matched
     */
    public String getPinyin(String word) {
        if (word == null) {
            return "";
        }
        String whole = pinyin.get(word);
        if (whole != null) {
            return whole;
        }
        StringBuilder ret = new StringBuilder();
        // Walk by code point so a character outside the BMP is looked up as one
        // key instead of as two meaningless surrogate halves.
        for (int i = 0; i < word.length();) {
            int codePoint = word.codePointAt(i);
            String py = pinyin.get(new String(Character.toChars(codePoint)));
            if (py != null) {
                ret.append(py);
            }
            i += Character.charCount(codePoint);
        }
        return ret.toString();
    }

    /**
     * Prints the decimal value of the hexadecimal literal {@code c208000a40}.
     *
     * <p>NOTE(review): this does not exercise {@code Normal} at all and looks
     * like leftover scratch code - confirm whether it is still needed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(Long.parseLong("c208000a40", 16));
    }
}