package org.talend.dataquality.nlp.toolkit;

import com.google.common.base.CharMatcher;
import com.google.common.collect.Lists;
import epic.models.NerSelector;
import epic.models.ParserSelector;
import epic.models.PosTagSelector;
import epic.parser.Parser;
import epic.preprocess.MLSentenceSegmenter;
import epic.preprocess.TreebankTokenizer;
import epic.sequences.CRF;
import epic.sequences.SemiCRF;
import epic.trees.AnnotatedLabel;
import epic.trees.Span;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.IndexWord;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.dictionary.Dictionary;
import scala.Tuple2;
import scala.collection.JavaConversions;

/**
 * {@link AbstractToolkit} implementation backed by the bundled English models of the
 * {@code epic.*} library (sentence segmenter, tokenizer, POS tagger, NER and parser) and
 * an extJWNL {@link Dictionary} used for lemmatization.
 *
 * <p>The models are held in static fields and are therefore shared by every instance of
 * this class. They are loaded the first time an instance is constructed.
 */
public class ScalaNLP extends AbstractToolkit {
    private static final long serialVersionUID = 1;

    /**
     * Upper bound used when cutting input into pieces: the maximum number of characters of a
     * sentence returned by {@link #sentenceSplit(String)} and the maximum number of tokens
     * handed to the tagger / NER model in one call.
     */
    private static final int SEPARATE_NUM = 5000;

    /**
     * Characters regarded as blank when deciding whether a sentence carries any content.
     * The literal is kept verbatim from the original code.
     */
    private static final CharMatcher BLANK = CharMatcher.anyOf("\r\n\t  ");

    // Fully qualified so that the file's import block does not need to change.
    private static final java.util.logging.Logger LOGGER =
            java.util.logging.Logger.getLogger(ScalaNLP.class.getName());

    private static MLSentenceSegmenter sentenceSplitter;
    private static TreebankTokenizer tokenizer;
    private static CRF<AnnotatedLabel, String> tagger;
    private static SemiCRF<Object, String> ner;
    private static Parser<AnnotatedLabel, String> parser;
    private static Dictionary dictionary;

    /**
     * Creates the toolkit, loading the shared English models if they are not loaded yet.
     *
     * @throws IOException declared for backward compatibility with existing callers
     * @throws JWNLException if the default extJWNL dictionary cannot be obtained
     */
    public ScalaNLP() throws IOException, JWNLException {
        loadModels();
    }

    /**
     * Populates every static model field that is still {@code null}. Synchronized so that
     * concurrent constructions do not load the same model twice.
     */
    @SuppressWarnings({"unchecked", "rawtypes"})
    private static synchronized void loadModels() throws JWNLException {
        if (sentenceSplitter == null) {
            sentenceSplitter = (MLSentenceSegmenter) MLSentenceSegmenter.bundled("en").get();
        }
        if (tokenizer == null) {
            tokenizer = new TreebankTokenizer();
        }
        if (tagger == null) {
            tagger = (CRF) PosTagSelector.loadTagger("en").get();
        }
        if (ner == null) {
            ner = (SemiCRF) NerSelector.loadNer("en").get();
        }
        if (parser == null) {
            parser = (Parser) ParserSelector.loadParser("en").get();
        }
        if (dictionary == null) {
            dictionary = Dictionary.getDefaultResourceInstance();
        }
    }

    /**
     * Splits a text into sentences.
     *
     * <p>Sentences consisting only of blank characters are dropped. A sentence longer than
     * {@value #SEPARATE_NUM} characters is cut into consecutive chunks of at most that many
     * characters, preferably at the last space inside each window; the remainder after the
     * last full window is kept as a final chunk. Order is preserved.
     *
     * @param str the text to split
     * @return the non-blank sentences (or sentence chunks), in document order
     */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public List<String> sentenceSplit(String str) {
        List<String> sentences = new ArrayList<String>();
        for (String sentence : JavaConversions.seqAsJavaList(sentenceSplitter.apply(str))) {
            if (isBlank(sentence)) {
                continue;
            }
            if (sentence.length() > SEPARATE_NUM) {
                appendChunks(sentence, sentences);
            } else {
                sentences.add(sentence);
            }
        }
        return sentences;
    }

    /** Returns {@code true} when the text is empty or made only of {@link #BLANK} characters. */
    private static boolean isBlank(String text) {
        return BLANK.matchesAllOf(text);
    }

    /**
     * Cuts an over-long sentence into chunks of at most {@value #SEPARATE_NUM} characters and
     * appends the non-blank ones to {@code out}.
     */
    private static void appendChunks(String sentence, List<String> out) {
        int start = 0;
        while (sentence.length() - start > SEPARATE_NUM) {
            int end = start + SEPARATE_NUM;
            int lastSpace = sentence.lastIndexOf(' ', end - 1);
            // Cut at the space only if it lies strictly inside the window; a space at the very
            // start (or none at all) would yield an empty chunk and no progress, so fall back
            // to a hard cut at the window size.
            if (lastSpace > start) {
                end = lastSpace;
            }
            addIfNotBlank(sentence.substring(start, end), out);
            start = end;
        }
        // Whatever is left after the last full window must not be lost.
        addIfNotBlank(sentence.substring(start), out);
    }

    /** Appends {@code chunk} to {@code out} unless it is blank. */
    private static void addIfNotBlank(String chunk, List<String> out) {
        if (!isBlank(chunk)) {
            out.add(chunk);
        }
    }

    /**
     * Tokenizes every sentence and concatenates the resulting tokens.
     *
     * @param list the sentences to tokenize
     * @return all tokens in order, with empty strings removed
     */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public List<String> tokenization(List<String> list) {
        List<String> tokens = new ArrayList<String>();
        for (String sentence : list) {
            tokens.addAll(JavaConversions.seqAsJavaList(tokenizer.apply(sentence)));
        }
        tokens.removeAll(Collections.singleton(""));
        return tokens;
    }

    /**
     * Tags each token with its part of speech.
     *
     * <p>The tokens are processed in consecutive chunks of at most {@value #SEPARATE_NUM}
     * tokens; the tags of all chunks are concatenated.
     *
     * @param list the tokens to tag
     * @return one tag label per token, in token order
     */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public List<String> partOfSpeechTagging(List<String> list) {
        List<String> tags = new ArrayList<String>();
        // Lists.partition yields a single chunk for short input and nothing for empty input,
        // so no special-casing of the size is needed.
        for (List<String> chunk : Lists.partition(list, SEPARATE_NUM)) {
            for (Object tag : JavaConversions.seqAsJavaList(tagger.bestSequence(JavaConversions.asScalaIterable(chunk).toIndexedSeq(), "").tags())) {
                tags.add(((AnnotatedLabel) tag).label());
            }
        }
        return tags;
    }

    /**
     * Labels each token with a named-entity type.
     *
     * <p>The tokens are processed in consecutive chunks of at most {@value #SEPARATE_NUM}
     * tokens. Tokens not covered by any segment returned by the model keep the label
     * {@code "O"}.
     *
     * @param list the tokens to label
     * @return one label per token, in token order
     */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    @SuppressWarnings("rawtypes")
    public List<String> namedEntityRecognition(List<String> list) {
        List<String> labels = new ArrayList<String>(Collections.nCopies(list.size(), "O"));
        int offset = 0;
        for (List<String> chunk : Lists.partition(list, SEPARATE_NUM)) {
            for (Tuple2 segment : JavaConversions.seqAsJavaList(ner.bestSequence(JavaConversions.asScalaIterable(chunk).toIndexedSeq(), "").label())) {
                Span span = (Span) segment._2();
                // Span positions are relative to the chunk; shift them by the chunk's offset.
                for (int position = span.begin(); position < span.end(); position++) {
                    labels.set(offset + position, segment._1().toString());
                }
            }
            offset += chunk.size();
        }
        return labels;
    }

    /**
     * Parses a tokenized sentence.
     *
     * @param list the tokens of the sentence
     * @return the string form of the parse produced by the parser
     */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public String syntacticParsing(List<String> list) {
        return parser.apply(JavaConversions.asScalaBuffer(list).toIndexedSeq()).toString();
    }

    /**
     * Looks up the dictionary lemma of each token, using its POS tag to pick the word class.
     *
     * <p>Tags starting with {@code NN}, {@code JJ}, {@code RB} and {@code VB} are looked up
     * as noun, adjective, adverb and verb respectively. A token is returned unchanged when
     * its tag has another prefix, when the dictionary has no entry for it, or when the
     * lookup fails, so the result always has one entry per token.
     *
     * @param list the tokens
     * @param list2 the POS tag of each token, aligned with {@code list}
     * @return one lemma (or the original token) per token, in token order
     */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public List<String> lemmatization(List<String> list, List<String> list2) {
        List<String> lemmas = new ArrayList<String>(list.size());
        for (int i = 0; i < list.size(); i++) {
            String token = list.get(i);
            IndexWord entry = null;
            try {
                POS pos = toDictionaryPos(list2.get(i));
                if (pos != null) {
                    entry = dictionary.lookupIndexWord(pos, token);
                }
            } catch (JWNLException e) {
                // Keep the output aligned with the input: fall back to the token itself.
                LOGGER.log(java.util.logging.Level.WARNING, "Lemma lookup failed for token: " + token, e);
            }
            lemmas.add(entry == null ? token : entry.getLemma());
        }
        return lemmas;
    }

    /** Maps a POS tag prefix to the dictionary word class, or {@code null} if unsupported. */
    private static POS toDictionaryPos(String tag) {
        if (tag.startsWith("NN")) {
            return POS.NOUN;
        }
        if (tag.startsWith("JJ")) {
            return POS.ADJECTIVE;
        }
        if (tag.startsWith("RB")) {
            return POS.ADVERB;
        }
        if (tag.startsWith("VB")) {
            return POS.VERB;
        }
        return null;
    }

    /** No-op: this toolkit uses the bundled model loaded at construction; the path is ignored. */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public void setSentenceSplitModelPath(String str) {
    }

    /** No-op: this toolkit creates its tokenizer at construction; the path is ignored. */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public void setTokenizationModelPath(String str) {
    }

    /** No-op: this toolkit uses the tagger loaded at construction; the path is ignored. */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public void setPartOfSpeechTaggingModelPath(String str) {
    }

    /** No-op: this toolkit uses the NER model loaded at construction; the path is ignored. */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public void setNamedEntityyModelPath(String str) {
    }

    /** No-op: this toolkit uses the parser loaded at construction; the path is ignored. */
    @Override // org.talend.dataquality.nlp.toolkit.AbstractToolkit
    public void setSyntacticParsingModelPath(String str) {
    }
}
