package org.talend.dataquality.semantic.extraction;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.semantic.index.LuceneIndex;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.model.ValidationMode;
import org.talend.dataquality.semantic.snapshot.DictionarySnapshot;

/* loaded from: input_file:org/talend/dataquality/semantic/extraction/ExtractFromDictionary.class */
/**
 * Extracts the parts of a field that match entries of a dictionary-based semantic category.
 *
 * <p>Candidate phrases are looked up through the {@link LuceneIndex} searcher and then validated
 * against the tokens of the field, either with a simplified comparison (when {@code standardize}
 * is set) or according to the category's {@link ValidationMode}.
 */
public class ExtractFromDictionary extends ExtractFromSemanticType {
    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractFromDictionary.class);

    /** Index used to look up candidate phrases for the semantic category. */
    private final LuceneIndex index;

    /** When {@code true}, matches carry the dictionary value instead of the text found in the field. */
    private final boolean standardize;

    /**
     * Creates an extractor for the given category.
     *
     * @param dictionarySnapshot snapshot handed to the parent extractor
     * @param dQCategory         semantic category whose dictionary is searched
     * @param z                  whether matched parts should report the dictionary value
     *                           ({@code true}) or the matching text of the field ({@code false})
     */
    public ExtractFromDictionary(DictionarySnapshot dictionarySnapshot, DQCategory dQCategory, boolean z) {
        super(dictionarySnapshot, dQCategory);
        this.standardize = z;
        this.index = (LuceneIndex) initIndex();
    }

    /**
     * Finds the dictionary matches contained in {@code str}.
     *
     * <p>The field is tokenized once with the default separators. If its value contains an
     * apostrophe or a dot, it is tokenized a second time with
     * {@link TokenizedString#FULL_SEPARATORS} and the additional matches are merged in.
     *
     * @param str the field to analyze
     * @return the matched parts in discovery order, without duplicates (as defined by the
     *         elements' {@code equals}/{@code hashCode})
     */
    @Override // org.talend.dataquality.semantic.extraction.ExtractFromSemanticType
    public List<MatchedPart> getMatches(String str) {
        TokenizedString tokenizedField = new TokenizedString(str);
        // LinkedHashSet: drops duplicates coming from the second pass while keeping insertion order.
        LinkedHashSet<MatchedPart> uniqueMatches = new LinkedHashSet<>(getMatchedParts(tokenizedField));
        String value = tokenizedField.getValue();
        if (value.contains("'") || value.contains(".")) {
            uniqueMatches.addAll(getMatchedParts(new TokenizedString(value, TokenizedString.FULL_SEPARATORS)));
        }
        return new ArrayList<>(uniqueMatches);
    }

    /**
     * Scans the tokens from left to right; for each start position the phrase is grown one token at
     * a time for as long as the index returns candidates, and the last validated match is kept.
     *
     * @param tokenizedString the tokenized field
     * @return the matches found, in order of their start position
     */
    private List<MatchedPart> getMatchedParts(TokenizedString tokenizedString) {
        List<MatchedPart> matchedParts = new ArrayList<>();
        List<String> tokens = tokenizedString.getTokens();
        int tokenCount = tokens.size();
        int start = 0;
        while (start < tokenCount) {
            MatchedPartDict bestMatch = null;
            List<String> phrase = new ArrayList<>();
            for (int end = start; end < tokenCount; end++) {
                phrase.add(StringUtils.stripAccents(tokens.get(end)));
                List<String> candidates = findMatches(phrase);
                if (candidates.isEmpty()) {
                    // No dictionary entry for this phrase: stop extending it.
                    break;
                }
                MatchedPartDict match = computeBestMatch(tokenizedString, start, end, candidates);
                if (match != null) {
                    // A later (longer) span overrides a shorter one.
                    bestMatch = match;
                }
            }
            if (bestMatch != null) {
                matchedParts.add(bestMatch);
                // Resume right after the matched span (the increment below moves past 'end').
                start = bestMatch.end;
            }
            start++;
        }
        return matchedParts;
    }

    /**
     * Searches the index for the phrase built by joining {@code list} with single spaces,
     * restricted to the current semantic category.
     */
    private List<String> findMatches(List<String> list) {
        return this.index.getSearcher().searchPhraseInSemanticCategory(this.semanticCategory.getId(), StringUtils.join(list, ' '));
    }

    /**
     * Validates the index candidates against the tokens {@code i..i2} of the field.
     *
     * <p>Candidates are tried from longest to shortest. The first candidate for which a matching
     * sub-part can be built decides the result: the method returns the match if it validates, or
     * {@code null} if it does not. Candidates for which no sub-part can be built are skipped.
     *
     * @param tokenizedString the tokenized field
     * @param i               index of the first token of the span
     * @param i2              index of the last token of the span
     * @param list            candidate dictionary values; sorted in place by descending length
     * @return the validated match, or {@code null} if there is none
     */
    private MatchedPartDict computeBestMatch(TokenizedString tokenizedString, int i, int i2, List<String> list) {
        list.sort(Comparator.comparingInt(String::length).reversed());
        for (String candidate : list) {
            TokenizedString tokenizedCandidate = new TokenizedString(candidate);
            try {
                TokenizedString matchingSubPart = tokenizedString.getMatchingSubPart(tokenizedCandidate, i, i2);
                if (this.standardize) {
                    if (equalsSimplified(tokenizedCandidate, matchingSubPart)) {
                        // Standardized output reports the dictionary value.
                        return new MatchedPartDict(tokenizedString, i, i2, tokenizedCandidate.getValue());
                    }
                } else if (matchValid(tokenizedCandidate, matchingSubPart)) {
                    // Non-standardized output reports the text as found in the field.
                    return new MatchedPartDict(tokenizedString, i, i2, matchingSubPart.getValue());
                }
                return null;
            } catch (IllegalArgumentException e) {
                // The sub-part could not be built for this candidate: try the next one.
                LOGGER.debug("Match {} can't be built from field {}.", candidate, tokenizedString, e);
            }
        }
        return null;
    }

    /**
     * Compares a dictionary entry with a sub-part of the field according to the category's
     * validation mode: exact string equality for {@code EXACT}, token-wise simplified comparison
     * for {@code SIMPLIFIED}, and case- and accent-insensitive comparison otherwise.
     */
    private boolean matchValid(TokenizedString tokenizedString, TokenizedString tokenizedString2) {
        ValidationMode validationMode = this.semanticCategory.getValidationMode();
        if (validationMode == ValidationMode.EXACT) {
            return tokenizedString.getValue().equals(tokenizedString2.getValue());
        }
        if (validationMode == ValidationMode.SIMPLIFIED) {
            return equalsSimplified(tokenizedString, tokenizedString2);
        }
        return equalsIgnoreCaseAndAccents(tokenizedString.getValue(), tokenizedString2.getValue());
    }

    /** Returns whether the two strings are equal once accents are stripped and case is ignored. */
    private boolean equalsIgnoreCaseAndAccents(String str, String str2) {
        return StringUtils.stripAccents(str).equalsIgnoreCase(StringUtils.stripAccents(str2));
    }

    /**
     * Returns whether both tokenized strings have the same number of tokens and every pair of
     * tokens at the same position is equal ignoring case and accents. Returns {@code false} if
     * either token list is {@code null}.
     */
    private boolean equalsSimplified(TokenizedString tokenizedString, TokenizedString tokenizedString2) {
        List<String> tokens = tokenizedString.getTokens();
        List<String> otherTokens = tokenizedString2.getTokens();
        if (tokens == null || otherTokens == null || tokens.size() != otherTokens.size()) {
            return false;
        }
        for (int pos = 0; pos < tokens.size(); pos++) {
            if (!equalsIgnoreCaseAndAccents(tokens.get(pos), otherTokens.get(pos))) {
                return false;
            }
        }
        return true;
    }
}
