package org.talend.dataquality.nlp.toolkit;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.io.IOUtils;

/* loaded from: input_file:org/talend/dataquality/nlp/toolkit/AbstractToolkit.class */
/**
 * Base class for NLP toolkits.
 *
 * <p>Concrete toolkits supply the model-backed steps (sentence splitting, tokenization, tagging,
 * entity recognition, parsing, lemmatization) and the model-path setters. This class provides the
 * model-path storage and getters, Porter stemming, stop-word removal backed by the
 * {@code stopwords.txt} classpath resource, and a helper that maps tokens back to character
 * offsets in the original text.
 */
public abstract class AbstractToolkit implements Serializable {
    private static final long serialVersionUID = 1;

    /** Classpath resource holding one stop word per line. */
    private static final String STOP_WORDS_RESOURCE = "stopwords.txt";

    String sentenceSplitModelPath;
    String tokenizationModelPath;
    String partOfSpeechTaggingModelPath;
    String namedEntityModelPath;
    String syntacticParsingModelPath;

    /** Matches two consecutive literal dots; used by {@link #getTokenOffset(List, String)}. */
    Pattern point = Pattern.compile("\\.{2}");

    /** Stop words exactly as read from {@code stopwords.txt}; looked up with lower-cased tokens. */
    private final Set<String> stopWordsList = loadStopWords();

    /** Half-open style pair of character offsets for a token; {@code (-1, -1)} marks "no span". */
    public static class TokenSpan implements Serializable {
        private static final long serialVersionUID = 1;
        int start;
        int end;

        /**
         * @param i  start offset of the token
         * @param i2 end offset of the token
         */
        public TokenSpan(int i, int i2) {
            this.start = i;
            this.end = i2;
        }

        /** @return the start offset, or {@code -1} when the token could not be located */
        public int getStart() {
            return this.start;
        }

        /** @return the end offset, or {@code -1} when the token could not be located */
        public int getEnd() {
            return this.end;
        }
    }

    /** Sentence-splitting step; behaviour is supplied by the concrete toolkit. */
    public abstract List<String> sentenceSplit(String str);

    /** Tokenization step; behaviour is supplied by the concrete toolkit. */
    public abstract List<String> tokenization(List<String> list);

    /** Part-of-speech tagging step; behaviour is supplied by the concrete toolkit. */
    public abstract List<String> partOfSpeechTagging(List<String> list);

    /** Named-entity recognition step; behaviour is supplied by the concrete toolkit. */
    public abstract List<String> namedEntityRecognition(List<String> list);

    /** Syntactic parsing step; behaviour is supplied by the concrete toolkit. */
    public abstract String syntacticParsing(List<String> list);

    /**
     * Lemmatization step; behaviour is supplied by the concrete toolkit.
     * NOTE(review): the two lists are presumably tokens and their POS tags - confirm against
     * the implementations.
     */
    public abstract List<String> lemmatization(List<String> list, List<String> list2);

    /** @return the currently stored sentence-split model path (may be {@code null}) */
    public String getSentenceSplitModelPath() {
        return this.sentenceSplitModelPath;
    }

    /** Sets the sentence-split model path; implemented by the concrete toolkit. */
    public abstract void setSentenceSplitModelPath(String str);

    /** @return the currently stored tokenization model path (may be {@code null}) */
    public String getTokenizationModelPath() {
        return this.tokenizationModelPath;
    }

    /** Sets the tokenization model path; implemented by the concrete toolkit. */
    public abstract void setTokenizationModelPath(String str);

    /** @return the currently stored part-of-speech model path (may be {@code null}) */
    public String getPartOfSpeechTaggingModelPath() {
        return this.partOfSpeechTaggingModelPath;
    }

    /** Sets the part-of-speech model path; implemented by the concrete toolkit. */
    public abstract void setPartOfSpeechTaggingModelPath(String str);

    /**
     * @return the currently stored named-entity model path (may be {@code null}); the doubled
     *         "y" in the method name is part of the published API and is kept for compatibility
     */
    public String getNamedEntityyModelPath() {
        return this.namedEntityModelPath;
    }

    /** Sets the named-entity model path; implemented by the concrete toolkit. */
    public abstract void setNamedEntityyModelPath(String str);

    /** @return the currently stored syntactic-parsing model path (may be {@code null}) */
    public String getSyntacticParsingModelPath() {
        return this.syntacticParsingModelPath;
    }

    /** Sets the syntactic-parsing model path; implemented by the concrete toolkit. */
    public abstract void setSyntacticParsingModelPath(String str);

    /**
     * Applies the Porter stemmer to every token.
     *
     * @param list tokens to stem
     * @return a new list with one stem per input token, in the same order
     */
    public List<String> stemming(List<String> list) {
        PorterStemmer porterStemmer = new PorterStemmer();
        List<String> stems = new ArrayList<>(list.size());
        for (String token : list) {
            stems.add(porterStemmer.stem(token));
        }
        return stems;
    }

    /**
     * Drops every token whose lower-cased form is a known stop word.
     *
     * @param list tokens to filter
     * @return a new list containing the surviving tokens in their original order and casing
     */
    public List<String> removeStopWords(List<String> list) {
        List<String> kept = new ArrayList<>();
        for (String token : list) {
            // Locale.ROOT keeps the lookup independent of the JVM default locale
            // (e.g. under a Turkish locale "I" would otherwise lower-case to a dotless i).
            if (!this.stopWordsList.contains(token.toLowerCase(Locale.ROOT))) {
                kept.add(token);
            }
        }
        return kept;
    }

    /**
     * Maps each token back to character offsets in the text it came from.
     *
     * <p>Tokens are searched for left to right, each search starting where the previous located
     * token ended. A token that cannot be located at or after that position gets the span
     * {@code (-1, -1)}, as does a {@code "."} token that directly follows a token ending in
     * {@code "."}.
     *
     * @param list tokens, in text order
     * @param str  the text the tokens were produced from
     * @return one {@link TokenSpan} per token, in the same order as {@code list}
     */
    public List<TokenSpan> getTokenOffset(List<String> list, String str) {
        List<TokenSpan> spans = new ArrayList<>(list.size());
        int consumedEnd = 0; // end offset of the last located token; the next search starts here
        int tokenStart = 0;
        for (int index = 0; index < list.size(); index++) {
            String token = list.get(index);
            int previousStart = tokenStart;

            // Search from the cursor without copying the remaining text. When the token is
            // absent the start becomes consumedEnd - 1, which fails the ">= consumedEnd" test below.
            int found = str.indexOf(token, consumedEnd);
            tokenStart = found >= 0 ? found : consumedEnd - 1;

            if (index >= 1 && list.get(index - 1).endsWith(".") && ".".equals(token)) {
                // A "." right after a token that already ends with "." gets no span of its own.
                tokenStart = previousStart;
                spans.add(new TokenSpan(-1, -1));
                continue;
            }

            if ("...".equals(token)) {
                // The dot pattern is applied to the following token, or to this one when it is last.
                String probe = index != list.size() - 1 ? list.get(index + 1) : token;
                Matcher matcher = this.point.matcher(probe);
                // With the default two-dot pattern a match is always "..", so the group check passes.
                if (matcher.find() && !"...".equals(matcher.group(0))) {
                    // NOTE(review): this derives the span from the previous token's start and the
                    // match position inside the probe token rather than from where "..." was found
                    // in the text; kept as-is because the intended behaviour is not evident here.
                    tokenStart = previousStart + matcher.start(0);
                    consumedEnd = tokenStart + matcher.group(0).length() + matcher.start(0);
                    spans.add(new TokenSpan(tokenStart, consumedEnd));
                    continue;
                }
            }

            if (tokenStart >= consumedEnd) {
                consumedEnd = tokenStart + token.length();
                // Tokens `` and '' are reported starting one character later.
                // NOTE(review): presumably because they stand for a one-character quote - confirm.
                boolean quoteToken = "``".equals(token) || "''".equals(token);
                spans.add(new TokenSpan(quoteToken ? tokenStart + 1 : tokenStart, consumedEnd));
            } else {
                spans.add(new TokenSpan(-1, -1));
            }
        }
        return spans;
    }

    /**
     * Reads the stop-word resource from the class loader of the concrete toolkit class.
     *
     * @return the set of lines found in the resource
     * @throws IllegalStateException if the resource is missing or cannot be read
     */
    private Set<String> loadStopWords() {
        InputStream resource = getClass().getClassLoader().getResourceAsStream(STOP_WORDS_RESOURCE);
        if (resource == null) {
            throw new IllegalStateException("Stop-word resource not found on classpath: " + STOP_WORDS_RESOURCE);
        }
        Set<String> words = new HashSet<>();
        // NOTE(review): the resource is decoded as UTF-8 instead of the platform default charset;
        // confirm stopwords.txt is stored as UTF-8 (identical for plain ASCII content).
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) {
            String line;
            // readLine() accepts both \n and \r\n, so entries never carry a trailing \r.
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
        } catch (IOException e) {
            throw new IllegalStateException("Unable to read stop-word resource: " + STOP_WORDS_RESOURCE, e);
        }
        return words;
    }
}
