package eu.project.ttc.eval; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import eu.project.ttc.engines.cleaner.TermProperty; import eu.project.ttc.models.Term; import eu.project.ttc.models.TermIndex; import eu.project.ttc.models.index.CustomTermIndex; import eu.project.ttc.models.index.TermIndexes; public class Tsv3ColFile { private static final Logger LOGGER = LoggerFactory.getLogger(Tsv3ColFile.class); private Path path; public Tsv3ColFile(Path path) { super(); Preconditions.checkArgument(path.toFile().exists(), "File %s does not exist", path.toString()); Preconditions.checkArgument(path.toFile().isFile(), "Not a file: %s", path.toString()); this.path = path; } public List<String[]> getLines() throws IOException { return lines() .collect(Collectors.toList()); } public Stream<Term[]> pairs(TermIndex sourceTermino, TermIndex targetTermino) throws IOException { CustomTermIndex sourceLemmaIndex = sourceTermino.getCustomIndex(TermIndexes.LEMMA_LOWER_CASE); CustomTermIndex targetLemmaIndex = targetTermino.getCustomIndex(TermIndexes.LEMMA_LOWER_CASE); return lines().filter(line -> { if(!sourceLemmaIndex.containsKey(line[1])) { LOGGER.debug("Ignoring ref line <{}> (term not found in source terminology)", Joiner.on(" ").join(line)); return false; } if(!targetLemmaIndex.containsKey(line[2])) { LOGGER.debug("Ignoring ref line <{}> (term not found in target terminology)", Joiner.on(" ").join(line)); return false; } return true; }).map(line -> { List<Term> sources = sourceLemmaIndex.getTerms(line[1]); Collections.sort(sources, TermProperty.FREQUENCY.getComparator(true)); List<Term> targets = targetLemmaIndex.getTerms(line[2]); Collections.sort(targets, TermProperty.FREQUENCY.getComparator(true)); LOGGER.debug("Reading eval pair. Source: <{}>. Target: <{}>", sources.get(0), targets.get(0)); return new Term[]{sources.get(0), targets.get(0)}; }); } public Stream<String[]> lines() throws IOException { return Files.lines(path) .map(line -> line.contains("#") ? line.substring(0, line.indexOf('#')) : line) .map(line -> line.trim()) .filter(line -> !line.isEmpty()) .map(line -> { List<String> list = Arrays.stream(line.split("\t")).map(col -> col.trim()).collect(Collectors.toList()); return list.toArray(new String[list.size()]); }) .filter(line -> { if(line.length < 3) { LOGGER.warn("Ignoring line <{}>. Only {} columns", Joiner.on('t').join(line), line.length); return false; } else return true; }) .map(line -> { if(line.length > 3) { LOGGER.warn("Line <{}> has {} columns (only three columns required). Ignoring additional columns.", Joiner.on('t').join(line), line.length); return new String[] {line[0], line[1], line[2]}; } else return line; }); } }