package org.talend.dataquality.parsing.fullname;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.lang3.StringUtils;
import org.talend.dataquality.parsing.core.FeaturesGenerator;
import third_party.org.chokkan.crfsuite.Attribute;
import third_party.org.chokkan.crfsuite.Item;
import third_party.org.chokkan.crfsuite.ItemSequence;

/* loaded from: input_file:org/talend/dataquality/parsing/fullname/FullnameFeaturesGenerator.class */
public class FullnameFeaturesGenerator implements FeaturesGenerator {
    private static final Pattern TOKEN_CLEAN_PATTERN = Pattern.compile("(^[\\W]*)|([^.\\w]*$)");
    private static final Pattern TOKEN_ABBREV_PATTERN = Pattern.compile("[^\\p{L}]+");
    private static final Pattern FULL_BRACKETED_PATTERN = Pattern.compile("[\"(\\']\\w+[\")\\']");
    private static final Pattern HALF_BRACKETED_PATTERN = Pattern.compile("([\"(\\']\\w+)|(\\w+[\")\\'])");
    private static final Pattern ROMAN_NUMERAL_PATTERN = Pattern.compile("[xvi]+");
    private static final Pattern FRENCH_ACCENT_PATTERN = Pattern.compile("[àéèêîïùç]");
    private static final Set<String> LAST_NAME_CN = new HashSet(Arrays.asList("bai", "cai", "cao", "chen", "cheng", "cui", "dai", "deng", "ding", "dong", "du", "duan", "fan", "fang", "feng", "fu", "gao", "gong", "gu", "guo", "han", "hao", "he", "hou", "hu", "huang", "ji", "jia", "jiang", "jin", "kong", "lei", "li", "liang", "liao", "lin", "liu", "long", "lu", "luo", "ma", "mao", "meng", "mo", "pan", "peng", "qian", "qin", "qiu", "ren", "shao", "shen", "shi", "song", "su", "sun", "tan", "tang", "tao", "tian", "wan", "wang", "wei", "wu", "xi", "xia", "xiang", "xiao", "xie", "xiong", "xu", "xue", "yan", "yang", "yao", "ye", "yin", "yu", "yuan", "zeng", "zhang", "zhao", "zheng", "zhong", "zhou", "zhu", "zou"));
    private static final Set<String> PARTICLE = new HashSet(Arrays.asList("la", "le", "les", "de", "du", "des", "del", "los", "las", "y", "am", "an", "auf", "der", "aus", "im", "von", "und", "zu", "zum", "zur", "da", "degli", "dei", "della", "di", "lo", "den", "op", "van", "ten", "ter", "te", "vanden", "vander"));
    private static final Set<String> Title = new HashSet(Arrays.asList("baroness", "lord", "lady", "sir", "dame", "baron", "viscount", "earl", "marquess", "duke"));
    private static final Set<String> PREPOSITIONS = new HashSet(Arrays.asList("of", "for", "to", "on"));
    private final DoubleMetaphone doubleMetaphone = new DoubleMetaphone();

    String cleanToken(String str) {
        return TOKEN_CLEAN_PATTERN.matcher(str.toLowerCase()).replaceAll("");
    }

    String abbrevToken(String str) {
        return TOKEN_ABBREV_PATTERN.matcher(str).replaceAll("");
    }

    boolean fullBracketed(String str) {
        return FULL_BRACKETED_PATTERN.matcher(str).matches();
    }

    boolean halfBracketed(String str, Boolean bool) {
        return bool.booleanValue() && HALF_BRACKETED_PATTERN.matcher(str).matches();
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    public static int isVowel(char c) {
        char lowerCase = Character.toLowerCase(c);
        return (lowerCase == 'a' || lowerCase == 'e' || lowerCase == 'i' || lowerCase == 'o' || lowerCase == 'u' || lowerCase == 'y') ? 1 : 0;
    }

    boolean hasVowels(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (isVowel(str.charAt(i)) == 1) {
                return true;
            }
        }
        return false;
    }

    boolean endsWithVowel(String str) {
        return str.length() >= 1 && isVowel(str.charAt(str.length() - 1)) == 1;
    }

    String vowelRatio(String str) {
        int length = str.length();
        return length > 1 ? String.valueOf(str.chars().map(i -> {
            return isVowel((char) i);
        }).sum() / length) : "false";
    }

    private List<String> ngrams(String str, int i) {
        ArrayList arrayList = new ArrayList();
        for (int i2 = 0; i2 < (str.length() - i) + 1; i2++) {
            arrayList.add(str.substring(i2, i2 + i));
        }
        return arrayList;
    }

    private List<String> prefixAndSuffix(String str) {
        ArrayList arrayList = new ArrayList();
        String sb = new StringBuilder(str).reverse().toString();
        for (int i = 1; i < str.length(); i++) {
            arrayList.add("prefix_" + i + "|" + str.substring(0, i));
            arrayList.add("suffix_" + i + "|" + sb.substring(0, i));
        }
        return arrayList;
    }

    boolean isRomanNumeral(String str) {
        return ROMAN_NUMERAL_PATTERN.matcher(str).matches();
    }

    boolean hasAccent(String str) {
        return FRENCH_ACCENT_PATTERN.matcher(str).find();
    }

    boolean isParticle(String str) {
        return PARTICLE.contains(str);
    }

    boolean isTitle(String str) {
        return Title.contains(str);
    }

    boolean isPrepo(String str) {
        return PREPOSITIONS.contains(str);
    }

    boolean isLNameCN(String str) {
        return LAST_NAME_CN.contains(str);
    }

    List<String> tokenToFeatures(String str) {
        ArrayList arrayList = new ArrayList();
        String cleanToken = cleanToken(str);
        String abbrevToken = abbrevToken(cleanToken);
        String stripAccents = StringUtils.stripAccents(abbrevToken);
        boolean fullBracketed = fullBracketed(str);
        boolean isAlpha = StringUtils.isAlpha(abbrevToken);
        arrayList.add("nopunc|" + abbrevToken);
        arrayList.add("abbrev|" + cleanToken.endsWith("."));
        arrayList.add("comma|" + str.endsWith(","));
        arrayList.add("hyphenated|" + cleanToken.contains("-"));
        arrayList.add("contracted|" + cleanToken.contains("'"));
        arrayList.add("bracketed|" + halfBracketed(str, Boolean.valueOf(!fullBracketed)));
        arrayList.add("fullbracketed|" + fullBracketed);
        arrayList.add("length|" + abbrevToken.length());
        arrayList.add("initial|" + (abbrevToken.length() == 1 && isAlpha));
        arrayList.add("has.vowels|" + hasVowels(abbrevToken));
        arrayList.add("just.letters|" + isAlpha);
        arrayList.add("roman|" + isRomanNumeral(abbrevToken));
        arrayList.add("endswith.vowel|" + endsWithVowel(stripAccents));
        arrayList.add("digits|" + StringUtils.isNumeric(abbrevToken));
        arrayList.add("metaphone1|" + this.doubleMetaphone.doubleMetaphone(abbrevToken, false));
        arrayList.add("metaphone2|" + this.doubleMetaphone.doubleMetaphone(abbrevToken, true));
        arrayList.add("more.vowels|" + vowelRatio(stripAccents));
        arrayList.add("first.name|" + NameRatio.getRatioString(stripAccents));
        arrayList.add("possessive|" + cleanToken.endsWith("'s"));
        arrayList.add("all.uppercase|" + StringUtils.isAllUpperCase(abbrevToken));
        arrayList.add("has.accent|" + hasAccent(abbrevToken));
        arrayList.add("is.particle|" + isParticle(cleanToken));
        arrayList.add("is.title|" + isTitle(cleanToken));
        arrayList.add("is.preposition|" + isPrepo(cleanToken));
        arrayList.add("lname.cn|" + isLNameCN(stripAccents));
        prefixAndSuffix(stripAccents).forEach(str2 -> {
            arrayList.add(str2);
        });
        ngrams(stripAccents, 3).forEach(str3 -> {
            arrayList.add(str3 + "|true");
        });
        ngrams(stripAccents, 4).forEach(str4 -> {
            arrayList.add(str4 + "|true");
        });
        return arrayList;
    }

    public ItemSequence getFeatureSequenceFromTokens(List<String> list) {
        ItemSequence itemSequence = new ItemSequence();
        if (list.size() == 1) {
            Item item = new Item();
            tokenToFeatures(list.get(0)).forEach(str -> {
                item.add(new Attribute(str));
            });
            item.add(new Attribute("singleton|true"));
            itemSequence.add(item);
        } else {
            List<List<String>> computeAllFeatures = computeAllFeatures(list);
            for (int i = 0; i < list.size(); i++) {
                Item item2 = new Item();
                computeAllFeatures.get(i).forEach(str2 -> {
                    item2.add(new Attribute(str2));
                });
                if (i > 0) {
                    computeAllFeatures.get(i - 1).forEach(str3 -> {
                        item2.add(new Attribute("prev_" + str3));
                    });
                }
                if (i < list.size() - 1) {
                    computeAllFeatures.get(i + 1).forEach(str4 -> {
                        item2.add(new Attribute("next_" + str4));
                    });
                }
                itemSequence.add(item2);
            }
        }
        return itemSequence;
    }

    private List<List<String>> computeAllFeatures(List<String> list) {
        ArrayList arrayList = new ArrayList();
        boolean z = false;
        for (int i = 0; i < list.size(); i++) {
            String str = list.get(i);
            List<String> list2 = tokenToFeatures(str);
            if (z) {
                list2.add("seen.comma|true");
            } else {
                z = str.endsWith(",");
            }
            if (i == 0) {
                list2.add("rawstring.start|true");
            } else if (i == list.size() - 1) {
                list2.add("rawstring.end|true");
            }
            arrayList.add(list2);
        }
        return arrayList;
    }
}
