/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.reader; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; import edu.emory.clir.clearnlp.dependency.DEPFeat; import edu.emory.clir.clearnlp.dependency.DEPNode; import edu.emory.clir.clearnlp.dependency.DEPTree; import edu.emory.clir.clearnlp.util.arc.AbstractArc; import edu.emory.clir.clearnlp.util.arc.DEPArc; import edu.emory.clir.clearnlp.util.arc.SRLArc; import edu.emory.clir.clearnlp.util.constant.PatternConst; import edu.emory.clir.clearnlp.util.constant.StringConst; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class TSVReader extends AbstractReader<DEPTree> { static public final String BLANK = StringConst.UNDERSCORE; /** The delimiter between columns. */ static public final String DELIM_COLUMN = StringConst.TAB; /** The delimiter between arcs. */ static public final String DELIM_ARCS = StringConst.SEMICOLON; private final Pattern P_COLUMN = PatternConst.TAB; private final Pattern P_ARCS = PatternConst.SEMICOLON; protected int i_id; protected int i_form; protected int i_lemma; protected int i_posTag; protected int i_namedEntityTag; protected int i_feats; protected int i_headID; protected int i_deprel; protected int i_xheads; protected int i_sheads; public TSVReader(int iForm) { super(TReader.TSV); init(-1, iForm, -1, -1, -1, -1, -1, -1, -1, -1); } /** For part-of-speech tagging. */ public TSVReader(int iForm, int iPOSTag) { super(TReader.TSV); init(-1, iForm, -1, iPOSTag, -1, -1, -1, -1, -1, -1); } /** For dependency parsing. */ public TSVReader(int iID, int iForm, int iLemma, int iPOSTag, int iFeats, int iHeadID, int iDeprel) { super(TReader.TSV); init(iID, iForm, iLemma, iPOSTag, -1, iFeats, iHeadID, iDeprel, -1, -1); } /** For semantic role labeling. */ public TSVReader(int iID, int iForm, int iLemma, int iPOSTag, int iFeats, int iHeadID, int iDeprel, int iSHeads) { super(TReader.TSV); init(iID, iForm, iLemma, iPOSTag, -1, iFeats, iHeadID, iDeprel, -1, iSHeads); } /** Including all. */ public TSVReader(int iID, int iForm, int iLemma, int iPOSTag, int iNERTag, int iFeats, int iHeadID, int iDeprel, int iXHeads, int iSHeads) { super(TReader.TSV); init(iID, iForm, iLemma, iPOSTag, iNERTag, iFeats, iHeadID, iDeprel, iXHeads, iSHeads); } /** * Constructs a dependency reader. * @param iID the column index of the node ID field. * @param iForm the column index of the word-form field. * @param iLemma the column index of the lemma field. * @param iPOSTag the column index of the POS field. * @param iNamedEntityTag the column index of the named entity tag field. * @param iFeats the column index of the extra features field. * @param iHeadID the column index of the head ID field. * @param iDeprel the column index of the dependency label field. * @param iXHeads the column index of the secondary dependency field. * @param iSHeads the column index of the semantic head field. */ public void init(int iID, int iForm, int iLemma, int iPOSTag, int iNamedEntityTag, int iFeats, int iHeadID, int iDeprel, int iXHeads, int iSHeads) { i_id = iID; i_form = iForm; i_lemma = iLemma; i_posTag = iPOSTag; i_namedEntityTag = iNamedEntityTag; i_feats = iFeats; i_headID = iHeadID; i_deprel = iDeprel; i_xheads = iXHeads; i_sheads = iSHeads; } @Override public AbstractReader<DEPTree> clone() { return new TSVReader(i_id, i_form, i_lemma, i_posTag, i_namedEntityTag, i_feats, i_headID, i_deprel, i_xheads, i_sheads); } @Override public DEPTree next() { DEPTree tree = null; try { List<String[]> lines = readLines(); if (lines == null) return null; tree = getDEPTree(lines); } catch (Exception e) {e.printStackTrace();} return tree; } /** Returns the next batch of lines. */ protected List<String[]> readLines() throws Exception { // skip empty lines String line; while ((line = b_reader.readLine()) != null) if (!isSkip(line)) break; // the end of the line if (line == null) { close(); return null; } // add lines List<String[]> list = new ArrayList<String[]>(); list.add(P_COLUMN.split(line)); while ((line = b_reader.readLine()) != null) { if (isSkip(line)) return list; else list.add(P_COLUMN.split(line)); } return list; } /** Called by {@link AbstractColumnReader#readLines()}. */ protected boolean isSkip(String line) { return line.trim().isEmpty(); } protected DEPTree getDEPTree(List<String[]> lines) { List<DEPNode> nodes = new ArrayList<>(); String form, lemma, pos, feats, nament; int id, i, size = lines.size(); DEPNode node; String[] tmp; // add nodes for (i=0; i<size; i++) { tmp = lines.get(i); form = tmp[i_form]; id = (i_id < 0) ? i+1 : Integer.parseInt(tmp[i_id]); lemma = (i_lemma < 0) ? null : tmp[i_lemma]; pos = (i_posTag < 0) ? null : tmp[i_posTag]; feats = (i_feats < 0) ? BLANK : tmp[i_feats]; nament = (i_namedEntityTag < 0 || tmp[i_namedEntityTag].equals(BLANK)) ? null : tmp[i_namedEntityTag]; node = new DEPNode(id, form, lemma, pos, nament, new DEPFeat(feats)); nodes.add(node); } DEPTree tree = new DEPTree(nodes); // add heads for (i=0; i<size; i++) { node = tree.get(i+1); tmp = lines.get(i); if (i_headID >= 0 && !tmp[i_headID].equals(BLANK)) node.setHead(tree.get(Integer.parseInt(tmp[i_headID])), tmp[i_deprel]); if (i_xheads >= 0) node.setSecondaryHeads(getSecondaryHeadList(tree, tmp[i_xheads])); if (i_sheads >= 0) node.setSemanticHeads(getSemanticHeadList(tree, tmp[i_sheads])); } return tree; } private List<DEPArc> getSecondaryHeadList(DEPTree tree, String heads) { List<DEPArc> arcs = new ArrayList<>(); if (heads.equals(BLANK)) return arcs; int headID, idx; String label; for (String head : P_ARCS.split(heads)) { idx = head.indexOf(AbstractArc.DELIM); headID = Integer.parseInt(head.substring(0, idx)); label = head.substring(idx+1); arcs.add(new DEPArc(tree.get(headID), label)); } return arcs; } private List<SRLArc> getSemanticHeadList(DEPTree tree, String heads) { List<SRLArc> arcs = new ArrayList<>(); if (heads.equals(BLANK)) return arcs; int headID, idx; String label; for (String head : P_ARCS.split(heads)) { idx = head.indexOf(AbstractArc.DELIM); headID = Integer.parseInt(head.substring(0, idx)); label = head.substring(idx+1); arcs.add(new SRLArc(tree.get(headID), label)); } return arcs; } public boolean hasPOSTags() { return i_posTag >= 0; } public boolean hasLemmas() { return i_lemma >= 0; } public boolean hasNamedEntityTags() { return i_namedEntityTag >= 0; } public boolean hasDependencyHeads() { return i_headID >= 0; } public boolean hasSemanticHeads() { return i_sheads >= 0; } }