package jhazm.reader; import com.infomancers.collections.yield.Yielder; import edu.stanford.nlp.ling.TaggedWord; import jhazm.Normalizer; import jhazm.tokenizer.WordTokenizer; import java.io.*; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; /** * interfaces Bijankhan Corpus (http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip) that * you must download and extract it. * * @author Mojtaba Khallash */ public class BijankhanReader { // // Fields // private final String[] punctuation = new String[] { "#", "*", ".", "؟", "!" }; private String bijankhanFile; private boolean joinedVerbParts; private String posMap; private Normalizer normalizer; private WordTokenizer tokenizer; // // Constructors // public BijankhanReader() throws IOException { this("resources/corpora/bijankhan.txt", true, "resources/data/posMaps.dat"); } public BijankhanReader(boolean joinedVerbParts) throws IOException { this("resources/corpora/bijankhan.txt", joinedVerbParts, "resources/data/posMaps.dat"); } public BijankhanReader(String posMap) throws IOException { this("resources/corpora/bijankhan.txt", true, posMap); } public BijankhanReader(boolean joinedVerbParts, String posMap) throws IOException { this("resources/corpora/bijankhan.txt", joinedVerbParts, posMap); } public BijankhanReader(String bijankhanFile, boolean joinedVerbParts, String posMap) throws IOException { this.bijankhanFile = bijankhanFile; this.joinedVerbParts = joinedVerbParts; this.posMap = posMap; this.normalizer = new Normalizer(true, false, true); this.tokenizer = new WordTokenizer(); } // // API // public Iterable<List<TaggedWord>> getSentences() { return new YieldSentence(); } // // Helper // private String getBijankhanFile() { return bijankhanFile; } private boolean isJoinedVerbParts() { return joinedVerbParts; } private HashMap getPosMap() throws IOException { if (this.posMap != null) { HashMap mapper = new HashMap(); for (String line : Files.readAllLines(Paths.get(this.posMap), Charset.forName("UTF8"))) { String[] parts = line.split(","); mapper.put(parts[0], parts[1]); } return mapper; } else return null; } private Normalizer getNormalizer() { return normalizer; } class YieldSentence extends Yielder<List<TaggedWord>> { private BufferedReader br; public YieldSentence() { try { FileInputStream fstream = new FileInputStream(getBijankhanFile()); DataInputStream in = new DataInputStream(fstream); br = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF8"))); } catch (Exception ex) { ex.printStackTrace(); } } @Override protected void yieldNextCore() { try { HashMap mapper = getPosMap(); List<TaggedWord> sentence = new ArrayList<>(); String line; while ((line = br.readLine()) != null) { String[] parts = line.trim().split(" +"); if (parts.length == 2) { String word = parts[0]; String tag = parts[1]; if (!(word.equals("#") || word.equals("*"))) { word = getNormalizer().run(word); if (word.isEmpty()) word = "_"; sentence.add(new TaggedWord(word, tag)); } if (tag.equals("DELM") && Arrays.asList(punctuation).contains(word)) { if (!sentence.isEmpty()) { if (isJoinedVerbParts()) sentence = PeykareReader.joinVerbParts(sentence); if (mapper != null) { for (TaggedWord tword : sentence) { tword.setTag(mapper.get(tword.tag()).toString()); } } yieldReturn(sentence); return; } } } } br.close(); } catch(Exception ex){ ex.printStackTrace(); } } } }