package org.talend.dataquality.nlp;

import com.clearspring.analytics.util.Lists;
import com.intel.ssg.bdt.nlp.CRFModel;
import com.intel.ssg.bdt.nlp.FeatureIndex;
import com.intel.ssg.bdt.nlp.Sequence;
import com.intel.ssg.bdt.nlp.Token;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.dictionary.Dictionary;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.talend.dataquality.nlp.CRFLabeling;
import org.talend.dataquality.nlp.toolkit.AbstractToolkit;
import org.talend.dataquality.nlp.toolkit.ToolkitFactory;
import org.talend.dataquality.nlp.toolkit.ToolkitType;
import scala.Tuple2;

/* loaded from: input_file:org/talend/dataquality/nlp/FeatureConstructor.class */
public class FeatureConstructor implements Serializable {
    private static final long serialVersionUID = 1;
    // NLP toolkit created in the constructor via ToolkitFactory; used for tagging, NER, lemmatization, stemming, etc.
    AbstractToolkit nlp;
    // WordNet dictionary shared by all instances; (re)assigned by every constructor call and left
    // unchanged (possibly null) when loading fails.
    static Dictionary dictionary;
    // Parts of speech probed by the "tokeninwordnet" feature (four entries, set in the constructor).
    POS[] types;
    // Lines of stopwords.txt.
    private Set<String> stopWordsList;
    // Lower-cased first column of the bundled name / acronym files (see loadNameFile).
    private Set<String> firstNameList;
    private Set<String> lastNameList;
    private Set<String> acronymList;
    // Token sequences of at least this length are passed through NLPProcessing.chunkArray with this
    // value and labelled chunk by chunk (see predict / evaluation).
    static int partitionSize = 3000;
    // "tokensuffixprefix": positive -> prefix of that many characters, otherwise suffix of |offset| characters.
    int offset;
    // "tokenismostfrequent": multiplied by the number of distinct tokens to decide how many top tokens to keep.
    double percent;
    // "tokenpositionrelative": added to both the token index and the sentence length before dividing.
    double compensation;
    // Feature name -> enabled flag.
    HashMap<String, Boolean> pipeline;
    // The raw ';'-separated feature list the instance was built with.
    private String pipelinestring;
    // "tokenmostfrequentpredecessor": maximum number of predecessors kept per key.
    int threshold;
    private ToolkitType tool;
    // Computed by getMostFrequentTokens when "tokenismostfrequent" is enabled; otherwise left unset.
    private Set<String> mostFrequentTokens;
    // Lower-cased geonames entries that contain no space.
    private Set<String> oneWordGeoSet = new HashSet();
    // First word of every geonames entry.
    private Set<String> initGeoSet = new HashSet();
    // First word of a geonames entry -> remaining words of each entry starting with it
    // (an empty array stands for a single-word entry).
    private Map<String, List<String[]>> geoMap = new HashMap();
    // Separator between feature columns; also compiled as a regex to split feature rows back into columns.
    String delimiter = "\t";
    Pattern delimiterPattern = Pattern.compile(this.delimiter);
    // Key -> most frequent predecessor tokens; computed in constructFeatures or injected through the setter.
    private LinkedHashMap<String, List<String>> mostFrequentPredecessorMap = null;
    // Non-digit characters that isDigit still accepts inside a numeric token.
    private Character[] list = {',', '-', '+', '.'};
    private Set<Character> digitSet = new HashSet(Arrays.asList(this.list));
    // Characters isPunct treats as punctuation ('}' and '"' appear twice; harmless once in the set).
    private Character[] list2 = {'!', '\"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', '-', ',', '.', '/', ';', ':', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '}', '\"'};
    private Set<Character> punctSet = new HashSet(Arrays.asList(this.list2));
    // Bracket placeholder tokens dropped before counting the letters that precede a labelled token (see predict).
    private String[] list3 = {"-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-"};
    private Set<String> toRemoveSet = new HashSet(Arrays.asList(this.list3));

    /* loaded from: input_file:org/talend/dataquality/nlp/FeatureConstructor$LookupLists.class */
    /**
     * Names the stop-word, first-name and last-name lookup lists.
     * NOTE(review): no code in this class references the enum; confirm external usage before removing it.
     */
    enum LookupLists {
        STOPWORDLIST,
        FIRSTNAMELIST,
        LASTNAMELIST
    }

    /* loaded from: input_file:org/talend/dataquality/nlp/FeatureConstructor$TupleComparator.class */
    /**
     * Orders {@code (token, count)} pairs by ascending count; the token itself is ignored.
     *
     * <p>This is a non-static inner class, so every instance keeps a reference to the enclosing
     * {@link FeatureConstructor}; that reference travels with the comparator when it is serialized.
     */
    public class TupleComparator implements Comparator<Tuple2<String, Integer>>, Serializable {
        private static final long serialVersionUID = 1;

        public TupleComparator() {
        }

        /**
         * Compares the two counts.
         *
         * @param tuple2  left pair
         * @param tuple22 right pair
         * @return a negative, zero or positive value when the left count is smaller than, equal to
         *         or greater than the right count
         */
        @Override // java.util.Comparator
        public int compare(Tuple2<String, Integer> tuple2, Tuple2<String, Integer> tuple22) {
            // Integer.compare cannot overflow, unlike subtracting the two counts.
            return Integer.compare(tuple2._2(), tuple22._2());
        }
    }

    public FeatureConstructor(ToolkitType toolkitType, String str) throws IOException, InstantiationException, IllegalAccessException, ClassNotFoundException, URISyntaxException, JWNLException {
        this.pipelinestring = str;
        this.nlp = ToolkitFactory.create(toolkitType);
        this.tool = toolkitType;
        try {
            dictionary = Dictionary.getDefaultResourceInstance();
        } catch (JWNLException e) {
            e.printStackTrace();
        }
        this.types = new POS[]{POS.ADJECTIVE, POS.ADVERB, POS.NOUN, POS.VERB};
        this.stopWordsList = new HashSet(Arrays.asList(IOUtils.toString(getClass().getClassLoader().getResourceAsStream("stopwords.txt")).split("\n")));
        this.firstNameList = loadNameFile("names/census-derived-all-first.txt");
        this.lastNameList = loadNameFile("names/dist.all.last.txt");
        this.acronymList = loadNameFile("acronyms.txt");
        for (String str2 : new HashSet(Arrays.asList(IOUtils.toString(getClass().getClassLoader().getResourceAsStream("geonames.txt")).toLowerCase().split("\n")))) {
            if (!str2.contains(" ")) {
                this.oneWordGeoSet.add(str2);
            }
            String[] split = str2.replaceAll("[\\(,'\\/\\.\\)-]", " ").split(" ");
            this.initGeoSet.add(split[0]);
            if (!this.geoMap.keySet().contains(split[0])) {
                this.geoMap.put(split[0], new ArrayList());
            }
            if (split.length > 1) {
                this.geoMap.get(split[0]).add(Arrays.copyOfRange(split, 1, split.length));
            } else {
                this.geoMap.get(split[0]).add(new String[0]);
            }
        }
        this.pipeline = new HashMap<>();
        this.pipeline.put("postag", false);
        this.pipeline.put("ner", false);
        this.pipeline.put("tokenization", false);
        this.pipeline.put("lemmatization", false);
        this.pipeline.put("stemming", false);
        this.pipeline.put("lowerwords", false);
        this.pipeline.put("tokenisnumeric", false);
        this.pipeline.put("tokenispunct", false);
        this.pipeline.put("tokeninwordnet", false);
        this.pipeline.put("tokeninstopwords", false);
        this.pipeline.put("tokeninfirstname", false);
        this.pipeline.put("tokeninlastname", false);
        this.pipeline.put("tokensuffixprefix", false);
        this.pipeline.put("tokenismostfrequent", false);
        this.pipeline.put("tokenpositionrelative", false);
        this.pipeline.put("tokeniscapitalized", false);
        this.pipeline.put("tokenisupper", false);
        this.pipeline.put("numtokeninline", false);
        this.pipeline.put("tokenmostfrequentpredecessor", false);
        this.pipeline.put("tokeninacronym", false);
        this.pipeline.put("tokeningeonames", false);
        for (String str3 : str.split(";")) {
            this.pipeline.put(str3, true);
        }
    }

    /**
     * Sets the offset used by the "tokensuffixprefix" feature.
     *
     * @param i a positive value selects a prefix of at most {@code i} characters; zero or a negative
     *          value selects the last {@code |i|} characters (an empty string for zero)
     */
    public void setSuffixPrefixOffset(int i) {
        this.offset = i;
    }

    /**
     * Sets the factor used by the "tokenismostfrequent" feature to decide how many of the most
     * frequent tokens are kept.
     *
     * @param d factor multiplied by a token count in {@code getMostFrequentTokens}
     * @return this instance, for chaining
     */
    public FeatureConstructor setMostFrequentPercent(double d) {
        this.percent = d;
        return this;
    }

    /**
     * Returns the raw ';'-separated feature list that was passed to the constructor.
     */
    public String getPipeline() {
        return this.pipelinestring;
    }

    /**
     * Sets the value added to both the token index and the sentence length when the
     * "tokenpositionrelative" feature computes {@code (index + d) / (size + d)}.
     *
     * @param d compensation term
     * @return this instance, for chaining
     */
    public FeatureConstructor setPositionRelativeCompensation(double d) {
        this.compensation = d;
        return this;
    }

    /**
     * Injects the predecessor map read by the "tokenmostfrequentpredecessor" feature.
     * {@code predict} and {@code evaluation} do not compute this map themselves, so it has to be
     * provided here (or computed by {@code constructFeatures}) before that feature is used.
     *
     * @param linkedHashMap key -> ordered list of predecessor tokens; stored by reference
     * @return this instance, for chaining
     */
    public FeatureConstructor setMostFrequentPredecessorMap(LinkedHashMap<String, List<String>> linkedHashMap) {
        this.mostFrequentPredecessorMap = linkedHashMap;
        return this;
    }

    /**
     * Returns the toolkit type this instance was constructed with.
     */
    public ToolkitType getToolkit() {
        return this.tool;
    }

    /**
     * Sets the maximum number of predecessors kept per key by {@code getMostFrequentPredecessor}:
     * keys with more distinct predecessors than this are truncated to the {@code i} most frequent.
     *
     * @param i maximum number of predecessors per key
     * @return this instance, for chaining
     */
    public FeatureConstructor setPredecessorOccurenceThredhold(int i) {
        this.threshold = i;
        return this;
    }

    /**
     * Returns the current predecessor map (the internal instance, not a copy); {@code null} until it
     * has been computed by {@code constructFeatures} or injected through the setter.
     */
    public LinkedHashMap<String, List<String>> getPredecessorMap() {
        return this.mostFrequentPredecessorMap;
    }

    /**
     * Loads a bundled word list and returns the lower-cased first space-separated column of each line.
     *
     * @param str classpath location of the list
     * @return the set of lower-cased entries (an empty line contributes the empty string)
     * @throws IOException if the resource does not exist or cannot be read
     */
    public Set<String> loadNameFile(String str) throws IOException {
        String content;
        // try-with-resources closes the classpath stream, which was previously leaked.
        try (InputStream in = getClass().getClassLoader().getResourceAsStream(str)) {
            if (in == null) {
                throw new IOException("Classpath resource not found: " + str);
            }
            content = IOUtils.toString(in);
        }
        Set<String> names = new HashSet<String>();
        for (String line : content.split("\n")) {
            String[] columns = line.split(" ");
            // A line consisting only of spaces splits into an empty array; skip it instead of failing.
            if (columns.length > 0) {
                names.add(columns[0].toLowerCase());
            }
        }
        return names;
    }

    /**
     * Merges two per-token feature lists position by position.
     *
     * <p>If either list is empty the other one is returned as is. Otherwise the result has one
     * entry per element of {@code list}: the two entries joined by the delimiter, or just the
     * non-empty one when the other is the empty string.
     *
     * @param list  first feature list; drives the iteration
     * @param list2 second feature list; must be at least as long as {@code list} when both are non-empty
     * @return the merged list
     */
    public List<String> zipFeature(List<String> list, List<String> list2) {
        if (list.isEmpty()) {
            return list2;
        }
        if (list2.isEmpty()) {
            return list;
        }
        List<String> merged = new ArrayList<String>();
        for (int pos = 0; pos < list.size(); pos++) {
            String left = list.get(pos);
            String combined;
            if (left.isEmpty()) {
                combined = list2.get(pos);
            } else {
                String right = list2.get(pos);
                combined = right.isEmpty() ? left : left + this.delimiter + right;
            }
            merged.add(combined);
        }
        return merged;
    }

    /**
     * Applies the toolkit's stop-word removal to every element of the RDD.
     *
     * @param javaRDD lists of strings to filter
     * @return an RDD holding the toolkit's result for each input list
     */
    public JavaRDD<List<String>> removeStopWords(JavaRDD<List<String>> javaRDD) {
        Function<List<String>, List<String>> stopWordFilter = new Function<List<String>, List<String>>() {
            private static final long serialVersionUID = 1;

            @Override
            public List<String> call(List<String> words) {
                // Reaches the toolkit through the enclosing instance.
                return FeatureConstructor.this.nlp.removeStopWords(words);
            }
        };
        return javaRDD.map(stopWordFilter);
    }

    /**
     * Applies the toolkit's sentence splitter to every string of the RDD.
     *
     * @param javaRDD texts to split
     * @return an RDD holding, for each text, the list returned by the toolkit
     */
    public JavaRDD<List<String>> sentenceSplit(JavaRDD<String> javaRDD) {
        Function<String, List<String>> splitter = new Function<String, List<String>>() {
            private static final long serialVersionUID = 1;

            @Override
            public List<String> call(String text) {
                return FeatureConstructor.this.nlp.sentenceSplit(text);
            }
        };
        return javaRDD.map(splitter);
    }

    /**
     * Applies the toolkit's tokenizer to every element of the RDD.
     *
     * @param javaRDD lists of strings to tokenize
     * @return an RDD holding the toolkit's token list for each input list
     */
    public JavaRDD<List<String>> tokenization(JavaRDD<List<String>> javaRDD) {
        Function<List<String>, List<String>> tokenizer = new Function<List<String>, List<String>>() {
            private static final long serialVersionUID = 1;

            @Override
            public List<String> call(List<String> list) {
                // The toolkit receives a fresh copy, never the list owned by the RDD.
                List<String> copy = new ArrayList<String>(list);
                return FeatureConstructor.this.nlp.tokenization(copy);
            }
        };
        return javaRDD.map(tokenizer);
    }

    /**
     * Runs the toolkit's named-entity recognition on every token list of the RDD.
     *
     * @param javaRDD token lists
     * @return an RDD holding, for each token list, a new list with the toolkit's NER output
     */
    public JavaRDD<List<String>> getTokenNERTags(JavaRDD<List<String>> javaRDD) {
        Function<List<String>, List<String>> nerTagger = new Function<List<String>, List<String>>() {
            private static final long serialVersionUID = 1;

            @Override
            public List<String> call(List<String> tokens) {
                // Copy the toolkit's result into a list owned by the caller.
                return new ArrayList<String>(FeatureConstructor.this.nlp.namedEntityRecognition(tokens));
            }
        };
        return javaRDD.map(nerTagger);
    }

    /**
     * Tells whether a token looks numeric: every character is either a digit or one of
     * {@code , - + .}. The empty string is reported as numeric because no character violates the rule.
     *
     * @param str token to inspect
     * @return {@code false} as soon as one character is neither a digit nor an accepted separator
     */
    public boolean isDigit(String str) {
        for (int pos = 0; pos < str.length(); pos++) {
            char ch = str.charAt(pos);
            boolean accepted = Character.isDigit(ch) || this.digitSet.contains(ch);
            if (!accepted) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tells whether a token consists solely of characters from the punctuation set.
     * The empty string is reported as punctuation because no character violates the rule.
     *
     * @param str token to inspect
     * @return {@code false} as soon as one character is not in the punctuation set
     */
    public boolean isPunct(String str) {
        for (int pos = 0; pos < str.length(); pos++) {
            if (!this.punctSet.contains(str.charAt(pos))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Counts the characters of {@code str} for which {@link Character#isAlphabetic(int)} is true.
     *
     * @param str text to scan
     * @return number of alphabetic characters
     */
    int countLetters(String str) {
        int letters = 0;
        for (int pos = 0; pos < str.length(); pos++) {
            if (Character.isAlphabetic(str.charAt(pos))) {
                letters++;
            }
        }
        return letters;
    }

    /**
     * Returns the most frequent tokens of the corpus.
     *
     * <p>Every token occurrence is counted, and the {@code (int) (distinctTokens * percent)} tokens
     * with the highest counts are returned.
     *
     * @param javaRDD token lists, one per sentence
     * @return the selected tokens (empty when the computed size is zero)
     */
    Set<String> getMostFrequentTokens(JavaRDD<List<String>> javaRDD) {
        JavaPairRDD<String, Integer> tokenCounts = javaRDD.flatMapToPair(new PairFlatMapFunction<List<String>, String, Integer>() {
            private static final long serialVersionUID = 1;

            public Iterable<Tuple2<String, Integer>> call(List<String> list) throws Exception {
                List<Tuple2<String, Integer>> ones = new ArrayList<Tuple2<String, Integer>>();
                for (String token : list) {
                    ones.add(new Tuple2<String, Integer>(token, 1));
                }
                return ones;
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1;

            public Integer call(Integer num, Integer num2) {
                return Integer.valueOf(num.intValue() + num2.intValue());
            }
        });

        // The decompiled source referenced an undeclared register ("r0") for this count. The reduced
        // pair RDD is the receiver of top(), so it is used here.
        // NOTE(review): confirm against the original sources that the distinct-token count was intended.
        int topCount = (int) (tokenCounts.count() * this.percent);

        Set<String> mostFrequent = new HashSet<String>();
        for (Tuple2<String, Integer> tokenAndCount : tokenCounts.top(topCount, new TupleComparator())) {
            mostFrequent.add(tokenAndCount._1());
        }
        return mostFrequent;
    }

    /**
     * Builds, for every key, the list of tokens that most often precede it.
     *
     * <p>The two RDDs are zipped; for each position {@code i} the pair
     * {@code (javaRDD2[i + 1], javaRDD[i])} is emitted, i.e. the element of the second list is the key
     * and the preceding element of the first list is the value. Keys with more than
     * {@code threshold} distinct predecessors keep only the {@code threshold} most frequent ones
     * (most frequent first); other keys keep all their distinct predecessors.
     *
     * @param javaRDD  lists providing the predecessor values
     * @param javaRDD2 lists providing the keys; presumably per-token labels aligned with {@code javaRDD}
     * @return key -> predecessors, in the iteration order of the collected map
     */
    LinkedHashMap<String, List<String>> getMostFrequentPredecessor(JavaRDD<List<String>> javaRDD, JavaRDD<List<String>> javaRDD2) {
        PairFlatMapFunction<Tuple2<List<String>, List<String>>, String, String> keyWithPredecessor = new PairFlatMapFunction<Tuple2<List<String>, List<String>>, String, String>() {
            private static final long serialVersionUID = 1;

            public Iterable<Tuple2<String, String>> call(Tuple2<List<String>, List<String>> tuple2) {
                List<String> values = tuple2._1();
                List<String> keys = tuple2._2();
                List<Tuple2<String, String>> pairs = new ArrayList<Tuple2<String, String>>();
                for (int next = 1; next < values.size(); next++) {
                    pairs.add(new Tuple2<String, String>(keys.get(next), values.get(next - 1)));
                }
                return pairs;
            }
        };
        Map<String, Iterable<String>> grouped = javaRDD.zip(javaRDD2).flatMapToPair(keyWithPredecessor).groupByKey().collectAsMap();

        LinkedHashMap<String, List<String>> result = new LinkedHashMap<>();
        for (Map.Entry<String, Iterable<String>> entry : grouped.entrySet()) {
            List<String> predecessors = Lists.newArrayList(entry.getValue());
            Set<String> distinct = new HashSet<String>(predecessors);
            if (distinct.size() <= this.threshold) {
                // Few enough candidates: keep them all.
                result.put(entry.getKey(), new ArrayList<String>(distinct));
                continue;
            }
            final Map<String, Integer> occurrences = new HashMap<String, Integer>();
            for (String predecessor : predecessors) {
                Integer seen = occurrences.get(predecessor);
                occurrences.put(predecessor, seen == null ? 1 : seen + 1);
            }
            List<String> ranked = new ArrayList<String>(occurrences.keySet());
            Collections.sort(ranked, new Comparator<String>() {
                @Override // java.util.Comparator
                public int compare(String first, String second) {
                    // Descending by occurrence count.
                    return Integer.compare(occurrences.get(second), occurrences.get(first));
                }
            });
            result.put(entry.getKey(), new ArrayList<String>(ranked.subList(0, this.threshold)));
        }
        return result;
    }

    /* JADX WARN: Multi-variable type inference failed */
    /**
     * Builds one feature row per token and merges the rows with externally supplied features.
     *
     * <p>For every token a delimiter-separated string is assembled from the enabled pipeline
     * features, always in this column order: tokenization, postag, ner, lemmatization, lowerwords,
     * tokenisnumeric, tokenispunct, tokeninwordnet, tokeninstopwords, tokeninfirstname,
     * tokeninlastname, tokensuffixprefix, tokenismostfrequent, tokenpositionrelative,
     * tokeniscapitalized, tokenisupper, tokenmostfrequentpredecessor (one column per map key),
     * tokeninacronym, stemming, tokeningeonames. The "numtokeninline" flag is not consulted here.
     *
     * @param list  tokens of one sentence
     * @param list2 additional per-token feature strings, merged through {@code zipFeature}
     * @return the merged feature rows
     * @throws JWNLException if the WordNet lookup fails
     */
    List<String> construct(List<String> list, List<String> list2) throws JWNLException {
        // arrayList = POS tags, arrayList2 = NER tags, arrayList3 = lemmas,
        // arrayList4 = resulting feature rows, arrayList5 = stems.
        List arrayList = new ArrayList();
        List arrayList2 = new ArrayList();
        List arrayList3 = new ArrayList();
        ArrayList arrayList4 = new ArrayList();
        List arrayList5 = new ArrayList();
        // State of the geonames matcher, carried from one token to the next (see "tokeningeonames" below).
        int i = 0;
        if (this.pipeline.get("postag").booleanValue()) {
            arrayList = this.nlp.partOfSpeechTagging(list);
        }
        if (this.pipeline.get("ner").booleanValue()) {
            arrayList2 = this.nlp.namedEntityRecognition(list);
        }
        if (this.pipeline.get("lemmatization").booleanValue()) {
            // The POS tag list passed here is empty when "postag" is disabled.
            arrayList3 = this.nlp.lemmatization(list, arrayList);
        }
        if (this.pipeline.get("stemming").booleanValue()) {
            arrayList5 = this.nlp.stemming(list);
        }
        for (int i2 = 0; i2 < list.size(); i2++) {
            // Every column is appended with a leading delimiter; the first one is stripped at the end.
            StringBuilder sb = new StringBuilder("");
            String lowerCase = list.get(i2).toLowerCase();
            if (this.pipeline.get("tokenization").booleanValue()) {
                sb.append(this.delimiter).append(list.get(i2));
            }
            if (this.pipeline.get("postag").booleanValue()) {
                sb.append(this.delimiter).append((String) arrayList.get(i2));
            }
            if (this.pipeline.get("ner").booleanValue()) {
                sb.append(this.delimiter).append((String) arrayList2.get(i2));
            }
            if (this.pipeline.get("lemmatization").booleanValue()) {
                sb.append(this.delimiter).append((String) arrayList3.get(i2));
            }
            if (this.pipeline.get("lowerwords").booleanValue()) {
                sb.append(this.delimiter).append(lowerCase);
            }
            if (this.pipeline.get("tokenisnumeric").booleanValue()) {
                sb.append(this.delimiter);
                if (isDigit(list.get(i2))) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokenispunct").booleanValue()) {
                sb.append(this.delimiter);
                if (isPunct(list.get(i2))) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokeninwordnet").booleanValue()) {
                // i3 counts the parts of speech for which WordNet has no entry for the token.
                // NOTE(review): `dictionary` is null when loading failed in the constructor, which would fail here.
                int i3 = 0;
                for (POS pos : this.types) {
                    if (dictionary.getIndexWord(pos, list.get(i2)) == null) {
                        i3++;
                    }
                }
                sb.append(this.delimiter);
                // 4 is the number of entries the constructor puts into `types`: 'F' only when all of them miss.
                if (i3 != 4) {
                    sb.append("T");
                } else {
                    sb.append("F");
                }
            }
            if (this.pipeline.get("tokeninstopwords").booleanValue()) {
                sb.append(this.delimiter);
                if (this.stopWordsList.contains(lowerCase)) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokeninfirstname").booleanValue()) {
                sb.append(this.delimiter);
                if (this.firstNameList.contains(lowerCase)) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokeninlastname").booleanValue()) {
                sb.append(this.delimiter);
                if (this.lastNameList.contains(lowerCase)) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokensuffixprefix").booleanValue()) {
                if (this.offset > 0) {
                    // Prefix of at most `offset` characters.
                    sb.append(this.delimiter).append(list.get(i2).substring(0, Math.min(this.offset, list.get(i2).length())));
                } else {
                    // Suffix of at most |offset| characters (empty when offset is 0).
                    sb.append(this.delimiter).append(list.get(i2).substring(Math.max(0, list.get(i2).length() + this.offset), list.get(i2).length()));
                }
            }
            if (this.pipeline.get("tokenismostfrequent").booleanValue()) {
                sb.append(this.delimiter);
                // mostFrequentTokens is only assigned by constructFeatures / predict / evaluation.
                if (this.mostFrequentTokens.contains(list.get(i2))) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokenpositionrelative").booleanValue()) {
                sb.append(this.delimiter).append(Double.toString((i2 + this.compensation) / (list.size() + this.compensation)));
            }
            if (this.pipeline.get("tokeniscapitalized").booleanValue()) {
                sb.append(this.delimiter);
                if (list.get(i2).length() <= 0 || !Character.isUpperCase(list.get(i2).charAt(0))) {
                    sb.append('F');
                } else {
                    sb.append('T');
                }
            }
            if (this.pipeline.get("tokenisupper").booleanValue()) {
                sb.append(this.delimiter);
                if (StringUtils.isAllUpperCase(list.get(i2))) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("tokenmostfrequentpredecessor").booleanValue()) {
                // One column per key of the predecessor map, in the map's iteration order.
                for (String str : this.mostFrequentPredecessorMap.keySet()) {
                    sb.append(this.delimiter);
                    if (this.mostFrequentPredecessorMap.get(str).contains(list.get(i2))) {
                        // NOTE(review): both operands are ints and index < size, so this integer division
                        // always appends 0; a fractional rank was probably intended. Left unchanged because
                        // fixing it would alter the emitted feature values.
                        sb.append(this.mostFrequentPredecessorMap.get(str).indexOf(list.get(i2)) / this.mostFrequentPredecessorMap.get(str).size());
                    } else {
                        sb.append("-1");
                    }
                }
            }
            if (this.pipeline.get("tokeninacronym").booleanValue()) {
                sb.append(this.delimiter);
                if (this.acronymList.contains(lowerCase)) {
                    sb.append('T');
                } else {
                    sb.append('F');
                }
            }
            if (this.pipeline.get("stemming").booleanValue()) {
                sb.append(this.delimiter).append((String) arrayList5.get(i2));
            }
            if (this.pipeline.get("tokeningeonames").booleanValue()) {
                sb.append(this.delimiter);
                // `i` > 0: this token is consumed as part of a previously matched name and gets 'T'.
                // `i` == -1: the previous look-ahead found no match; this token gets 'F' and the state resets.
                if (i > 0) {
                    sb.append('T');
                    i--;
                } else if (i == -1) {
                    sb.append('F');
                    i = 0;
                } else {
                    // Token reduced to lower-case letters (and spaces) for the lookup.
                    String replaceAll = lowerCase.replaceAll("[^a-z ]", "");
                    if (!this.initGeoSet.contains(replaceAll) || replaceAll.equals("")) {
                        // Not the first word of any entry: fall back to the single-word entries.
                        if (!this.oneWordGeoSet.contains(lowerCase) || replaceAll.equals("")) {
                            sb.append('F');
                        } else {
                            sb.append('T');
                        }
                    } else if (this.geoMap.get(replaceAll).size() == 1 && this.geoMap.get(replaceAll).get(0).length == 0) {
                        // The only entry starting with this word is the word itself.
                        sb.append('T');
                    } else {
                        // Look ahead over the following tokens to match the remaining words of the candidates.
                        // arrayList6: per candidate, the look-ahead length at which it matched (-1 = not matched);
                        // arrayList7: per candidate, the words still to be matched (null = candidate dropped).
                        i++;
                        int i4 = i2 + 1;
                        ArrayList arrayList6 = new ArrayList();
                        ArrayList arrayList7 = new ArrayList();
                        for (int i5 = 0; i5 < this.geoMap.get(replaceAll).size(); i5++) {
                            arrayList6.add(-1);
                            arrayList7.add(new LinkedList(Arrays.asList(this.geoMap.get(replaceAll).get(i5))));
                        }
                        // NOTE(review): as decompiled, this loop contains no break or return, so it never
                        // terminates once entered (it spins when i4 reaches list.size()). The original
                        // control flow was lost in decompilation and must be restored from the real sources.
                        while (true) {
                            if (i4 < list.size()) {
                                String replaceAll2 = list.get(i4).toLowerCase().replaceAll("[^a-z ]", "");
                                if (replaceAll2.equals("")) {
                                    // Tokens without letters are skipped but still counted as consumed.
                                    i4++;
                                    i++;
                                } else {
                                    for (int i6 = 0; i6 < arrayList7.size(); i6++) {
                                        // "0 == 0" is a decompiler artifact; the test is just the two remaining conditions.
                                        if (0 == 0 && arrayList7.get(i6) != null && ((List) arrayList7.get(i6)).size() == 0) {
                                            arrayList6.set(i6, 0);
                                            arrayList7.set(i6, null);
                                        }
                                        if (arrayList7.get(i6) != null && ((List) arrayList7.get(i6)).size() != 0) {
                                            if (replaceAll2.equals(((List) arrayList7.get(i6)).get(0))) {
                                                ((List) arrayList7.get(i6)).remove(0);
                                            } else {
                                                arrayList7.set(i6, null);
                                            }
                                        }
                                        if (arrayList7.get(i6) != null && ((List) arrayList7.get(i6)).size() == 0) {
                                            arrayList6.set(i6, Integer.valueOf(i));
                                        }
                                    }
                                    i4++;
                                    i++;
                                    // i7 counts the candidates that still have words left to match.
                                    int i7 = 0;
                                    for (int i8 = 0; i8 < arrayList7.size(); i8++) {
                                        if (arrayList7.get(i8) != null && ((List) arrayList7.get(i8)).size() != 0) {
                                            i7++;
                                        }
                                    }
                                    if (i7 == 0) {
                                        // No candidate is pending: keep the longest match (or -1 when none matched).
                                        i = ((Integer) Collections.max(arrayList6)).intValue();
                                        if (i >= 0) {
                                            sb.append('T');
                                        } else if (this.oneWordGeoSet.contains(lowerCase)) {
                                            sb.append('T');
                                        } else {
                                            sb.append('F');
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            // Drop the leading delimiter (the delimiter string is interpreted as a regex here).
            arrayList4.add(sb.toString().replaceFirst(this.delimiter, ""));
        }
        return zipFeature(arrayList4, list2);
    }

    /**
     * Builds the feature rows for a whole corpus.
     *
     * <p>The corpus-level statistics needed by the enabled features (most frequent tokens,
     * most frequent predecessors) are computed first; then {@code construct} is applied to every
     * zipped pair of token list and extra-feature list.
     *
     * @param javaRDD  token lists, one per sentence
     * @param javaRDD2 additional per-token features, zipped with {@code javaRDD}
     * @param javaRDD3 lists used as keys when computing the predecessor map; presumably the labels
     * @return one list of feature rows per sentence
     * @throws IOException declared by the signature; nothing in this body throws it directly
     */
    public JavaRDD<List<String>> constructFeatures(JavaRDD<List<String>> javaRDD, JavaRDD<List<String>> javaRDD2, JavaRDD<List<String>> javaRDD3) throws IOException {
        boolean frequentTokensEnabled = this.pipeline.get("tokenismostfrequent");
        if (frequentTokensEnabled) {
            this.mostFrequentTokens = getMostFrequentTokens(javaRDD);
        }
        boolean predecessorsEnabled = this.pipeline.get("tokenmostfrequentpredecessor");
        if (predecessorsEnabled) {
            this.mostFrequentPredecessorMap = getMostFrequentPredecessor(javaRDD, javaRDD3);
        }
        Function<Tuple2<List<String>, List<String>>, List<String>> rowBuilder = new Function<Tuple2<List<String>, List<String>>, List<String>>() {
            private static final long serialVersionUID = 1;

            @Override
            public List<String> call(Tuple2<List<String>, List<String>> tuple2) throws JWNLException {
                List<String> tokens = tuple2._1();
                List<String> extraFeatures = tuple2._2();
                return FeatureConstructor.this.construct(tokens, extraFeatures);
            }
        };
        return javaRDD.zip(javaRDD2).map(rowBuilder);
    }

    /**
     * Labels every sentence with the given CRF model.
     *
     * <p>For each sentence the feature rows are built with {@code construct}, turned into tokens
     * and labelled by the model; sequences of {@code partitionSize} tokens or more are chunked and
     * labelled chunk by chunk. Every token whose label is not {@code "O"} is handed to
     * {@code CRFLabeling.labelSentence} together with the number of letters that precede it
     * (bracket placeholder tokens excluded).
     *
     * <p>The predecessor map is not computed here; it must already be set when that feature is enabled.
     *
     * @param javaRDD   token lists, one per sentence
     * @param javaRDD2  strings passed as second argument to {@code labelSentence}; presumably the raw sentences
     * @param javaRDD3  additional per-token features, zipped with {@code javaRDD}
     * @param cRFModel  trained model
     * @return pairs of (result of {@code labelSentence}, tab-joined labels); a pair of empty strings
     *         for sentences without feature rows
     */
    public JavaPairRDD<String, String> predict(JavaRDD<List<String>> javaRDD, JavaRDD<String> javaRDD2, JavaRDD<List<String>> javaRDD3, final CRFModel cRFModel) {
        final FeatureIndex makeFeatureIdx = cRFModel.makeFeatureIdx();
        boolean frequentTokensEnabled = this.pipeline.get("tokenismostfrequent");
        if (frequentTokensEnabled) {
            this.mostFrequentTokens = getMostFrequentTokens(javaRDD);
        }
        PairFunction<Tuple2<Tuple2<List<String>, List<String>>, String>, String, String> sentenceLabeler = new PairFunction<Tuple2<Tuple2<List<String>, List<String>>, String>, String, String>() {
            private static final long serialVersionUID = 1;

            @Override
            public Tuple2<String, String> call(Tuple2<Tuple2<List<String>, List<String>>, String> tuple2) throws JWNLException {
                List<String> tokens = tuple2._1()._1();
                List<String> featureRows = FeatureConstructor.this.construct(tokens, tuple2._1()._2());
                if (featureRows.isEmpty()) {
                    return new Tuple2<>("", "");
                }

                // One unlabelled CRF token per feature row; the columns are recovered by splitting on the delimiter.
                Token[] unlabelled = new Token[featureRows.size()];
                for (int row = 0; row < unlabelled.length; row++) {
                    unlabelled[row] = new Token(null, FeatureConstructor.this.delimiterPattern.split(featureRows.get(row)));
                }
                Sequence input = new Sequence(unlabelled);
                Token[] inputTokens = input.toArray();

                Sequence labelled;
                if (inputTokens.length < FeatureConstructor.partitionSize) {
                    labelled = cRFModel.testCRF(input, makeFeatureIdx);
                } else {
                    // Long sequences are labelled chunk by chunk and stitched back together in order.
                    Token[] stitched = new Token[inputTokens.length];
                    int filled = 0;
                    for (Token[] chunk : NLPProcessing.chunkArray(inputTokens, FeatureConstructor.partitionSize)) {
                        Token[] chunkResult = cRFModel.testCRF(new Sequence(chunk), makeFeatureIdx).toArray();
                        System.arraycopy(chunkResult, 0, stitched, filled, chunkResult.length);
                        filled += chunkResult.length;
                    }
                    labelled = new Sequence(stitched);
                }

                List<String> labels = new ArrayList<String>();
                for (Token labelledToken : labelled.toArray()) {
                    labels.add(labelledToken.label());
                }

                ArrayList<CRFLabeling.TokenNumCharPair> entities = new ArrayList<CRFLabeling.TokenNumCharPair>();
                for (int pos = 0; pos < tokens.size(); pos++) {
                    String label = labels.get(pos);
                    if ("O".equals(label)) {
                        continue;
                    }
                    // Letters preceding the token, ignoring bracket placeholders.
                    List<String> preceding = new ArrayList<String>(tokens.subList(0, pos));
                    preceding.removeAll(FeatureConstructor.this.toRemoveSet);
                    int lettersBefore = FeatureConstructor.this.countLetters(StringUtils.join(preceding, ""));
                    entities.add(new CRFLabeling.TokenNumCharPair(tokens.get(pos), lettersBefore, label));
                }
                return new Tuple2<>(CRFLabeling.labelSentence(entities, tuple2._2()), StringUtils.join(labels, "\t"));
            }
        };
        return javaRDD.zip(javaRDD3).zip(javaRDD2).mapToPair(sentenceLabeler);
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Scores the model against reference labels.
     *
     * <p>The predicted labels of all sentences are flattened into one RDD, the reference label
     * lists are flattened the same way, and the zipped pairs are passed to
     * {@code CRFLabeling.getScores}. Sequences of {@code partitionSize} tokens or more are labelled
     * chunk by chunk.
     *
     * <p>The predecessor map is not computed here; it must already be set when that feature is enabled.
     *
     * @param javaRDD   token lists, one per sentence
     * @param javaRDD2  additional per-token features, zipped with {@code javaRDD}
     * @param javaRDD3  reference labels, one list per sentence
     * @param cRFModel  trained model
     * @return the scores computed by {@code CRFLabeling.getScores}
     */
    public List<CRFLabeling.Scores> evaluation(JavaRDD<List<String>> javaRDD, JavaRDD<List<String>> javaRDD2, JavaRDD<List<String>> javaRDD3, final CRFModel cRFModel) {
        final FeatureIndex makeFeatureIdx = cRFModel.makeFeatureIdx();
        boolean frequentTokensEnabled = this.pipeline.get("tokenismostfrequent");
        if (frequentTokensEnabled) {
            this.mostFrequentTokens = getMostFrequentTokens(javaRDD);
        }

        JavaRDD<String> predictedLabels = javaRDD.zip(javaRDD2).flatMap(new FlatMapFunction<Tuple2<List<String>, List<String>>, String>() {
            private static final long serialVersionUID = 1;

            @Override
            public Iterable<String> call(Tuple2<List<String>, List<String>> tuple2) throws JWNLException {
                List<String> featureRows = FeatureConstructor.this.construct(tuple2._1(), tuple2._2());
                if (featureRows.isEmpty()) {
                    return new ArrayList<String>();
                }

                // One unlabelled CRF token per feature row.
                Token[] unlabelled = new Token[featureRows.size()];
                for (int row = 0; row < unlabelled.length; row++) {
                    unlabelled[row] = new Token(null, FeatureConstructor.this.delimiterPattern.split(featureRows.get(row)));
                }
                Sequence input = new Sequence(unlabelled);
                Token[] inputTokens = input.toArray();

                Sequence labelled;
                if (inputTokens.length < FeatureConstructor.partitionSize) {
                    labelled = cRFModel.testCRF(input, makeFeatureIdx);
                } else {
                    // Long sequences are labelled chunk by chunk and stitched back together in order.
                    Token[] stitched = new Token[inputTokens.length];
                    int filled = 0;
                    for (Token[] chunk : NLPProcessing.chunkArray(inputTokens, FeatureConstructor.partitionSize)) {
                        Token[] chunkResult = cRFModel.testCRF(new Sequence(chunk), makeFeatureIdx).toArray();
                        System.arraycopy(chunkResult, 0, stitched, filled, chunkResult.length);
                        filled += chunkResult.length;
                    }
                    labelled = new Sequence(stitched);
                }

                List<String> labels = new ArrayList<String>();
                for (Token labelledToken : labelled.toArray()) {
                    labels.add(labelledToken.label());
                }
                return labels;
            }
        });

        JavaRDD<String> referenceLabels = javaRDD3.flatMap(new FlatMapFunction<List<String>, String>() {
            private static final long serialVersionUID = 1;

            @Override
            public Iterable<String> call(List<String> list) {
                // Flattening only: each reference label becomes one element.
                return new ArrayList<String>(list);
            }
        });

        return CRFLabeling.getScores(predictedLabels.zip(referenceLabels));
    }

    /**
     * Maps every list of the RDD to its size.
     *
     * @param javaRDD token lists
     * @return the number of elements of each list
     */
    public JavaRDD<Integer> numTokensInLine(JavaRDD<List<String>> javaRDD) {
        Function<List<String>, Integer> sizeOf = new Function<List<String>, Integer>() {
            private static final long serialVersionUID = 1;

            @Override
            public Integer call(List<String> tokens) {
                int count = tokens.size();
                return Integer.valueOf(count);
            }
        };
        return javaRDD.map(sizeOf);
    }

    /**
     * Reads a bundled nickname file and returns all of its words, lower-cased.
     *
     * <p>Words are separated by a space, a tab or a newline. A read failure is logged and treated
     * as an empty file, in which case the result contains only the empty string.
     *
     * @param str classpath location of the nickname file
     * @return the set of lower-cased words
     * @throws IllegalArgumentException if the resource does not exist
     */
    public static Set<String> createNickNameSet(String str) {
        String content = "";
        // try-with-resources closes the classpath stream, which was previously leaked.
        try (InputStream in = FeatureConstructor.class.getClassLoader().getResourceAsStream(str)) {
            if (in == null) {
                // Fail with a message naming the resource instead of an anonymous NullPointerException.
                throw new IllegalArgumentException("Classpath resource not found: " + str);
            }
            content = IOUtils.toString(in);
        } catch (IOException e) {
            // Best effort, as before: continue with whatever content is available (none).
            e.printStackTrace();
        }
        Set<String> words = new HashSet<String>();
        for (String word : content.split(" |\t|\n")) {
            words.add(word.toLowerCase());
        }
        return words;
    }

    /**
     * Builds a lookup table from each name to all names that share a line with it in a bundled
     * nickname file.
     *
     * <p>Each line of the file is a group of names separated by a space or a tab; names are
     * lower-cased. For every element of {@code set} the table holds the union of all groups that
     * contain it (an empty set when no group does). A read failure is logged and the lines read so
     * far are used.
     *
     * @param str classpath location of the nickname file
     * @param set names to index
     * @return name -> related names
     * @throws IllegalArgumentException if the resource does not exist
     */
    public static Map<String, Set<String>> createNickNameTable(String str, Set<String> set) {
        InputStream resourceAsStream = FeatureConstructor.class.getClassLoader().getResourceAsStream(str);
        if (resourceAsStream == null) {
            // Fail with a message naming the resource instead of an anonymous NullPointerException.
            throw new IllegalArgumentException("Classpath resource not found: " + str);
        }

        List<List<String>> groups = new ArrayList<List<String>>();
        // The try block wraps the whole read loop: an I/O error ends reading instead of being retried
        // forever, and the reader (with its underlying stream) is always closed.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(resourceAsStream))) {
            String line;
            while ((line = reader.readLine()) != null) {
                List<String> group = new ArrayList<String>();
                for (String name : line.split(" |\t")) {
                    group.add(name.toLowerCase());
                }
                groups.add(group);
            }
        } catch (IOException e) {
            // Best effort, as before: keep the groups read so far.
            e.printStackTrace();
        }

        Map<String, Set<String>> table = new HashMap<String, Set<String>>();
        for (String name : set) {
            Set<String> related = new HashSet<String>();
            for (List<String> group : groups) {
                if (group.contains(name)) {
                    related.addAll(group);
                }
            }
            table.put(name, related);
        }
        return table;
    }

    /**
     * Looks up the names related to {@code str}.
     *
     * @param str name to look up
     * @param set names known to the table
     * @param map name -> related names
     * @return the table entry for {@code str} when {@code set} contains it (this may be
     *         {@code null} if the table has no such key); otherwise a new set holding only {@code str}
     */
    public static Set<String> searchNickName(String str, Set<String> set, Map<String, Set<String>> map) {
        if (set.contains(str)) {
            return map.get(str);
        }
        Set<String> onlySelf = new HashSet<String>();
        onlySelf.add(str);
        return onlySelf;
    }
}
