/*
 * Decompiled with CFR 0.152.
 */
package org.talend.dataquality.semantic.extraction;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.dataquality.semantic.extraction.ExtractFromSemanticType;
import org.talend.dataquality.semantic.extraction.MatchedPart;
import org.talend.dataquality.semantic.extraction.MatchedPartDict;
import org.talend.dataquality.semantic.extraction.TokenizedString;
import org.talend.dataquality.semantic.index.LuceneIndex;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.model.ValidationMode;
import org.talend.dataquality.semantic.snapshot.DictionarySnapshot;

/**
 * Extracts the parts of a field that match entries of a dictionary-based semantic category.
 *
 * <p>Candidate phrases are built from consecutive tokens of the field and looked up through the
 * searcher of a {@link LuceneIndex}, restricted to the id of the semantic category. Depending on
 * the {@code standardize} flag, a matched part carries either the dictionary value or the text
 * found in the field.
 */
public class ExtractFromDictionary extends ExtractFromSemanticType {

    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractFromDictionary.class);

    /**
     * Pattern handed to the alternative tokenization: runs of punctuation, whitespace and the
     * space characters U+00A0, U+2007, U+202F and U+3000.
     * NOTE(review): presumably used by {@link TokenizedString} as the token separator - confirm.
     */
    private static final String PUNCTUATION_AND_SPACES = "[\\p{Punct}\\s\\u00A0\\u2007\\u202F\\u3000]+";

    /** Orders dictionary entries from the longest string to the shortest. */
    private static final Comparator<String> LONGEST_FIRST = Comparator.comparingInt(String::length).reversed();

    private final LuceneIndex index;
    private boolean standardize;

    /**
     * @param snapshot dictionary snapshot, forwarded to the parent constructor
     * @param category semantic category, forwarded to the parent constructor
     * @param standardize when {@code true}, matched parts carry the dictionary value instead of
     *        the text taken from the field
     */
    protected ExtractFromDictionary(DictionarySnapshot snapshot, DQCategory category, boolean standardize) {
        super(snapshot, category);
        this.standardize = standardize;
        this.index = (LuceneIndex) initIndex();
    }

    /**
     * Finds the matched parts of {@code field}.
     *
     * <p>The field is first tokenized with the default {@link TokenizedString} constructor. When
     * its value contains an apostrophe or a dot, it is tokenized a second time with
     * {@link #PUNCTUATION_AND_SPACES} and the matches of that pass are added as well.
     *
     * @param field the text to search in
     * @return the matched parts in encounter order, without duplicates
     */
    @Override
    public List<MatchedPart> getMatches(String field) {
        TokenizedString defaultTokenization = new TokenizedString(field);
        LinkedHashSet<MatchedPart> distinctParts = new LinkedHashSet<>(getMatchedParts(defaultTokenization));

        String value = defaultTokenization.getValue();
        boolean hasApostropheOrDot = value.contains("'") || value.contains(".");
        if (hasApostropheOrDot) {
            TokenizedString punctuationTokenization = new TokenizedString(value, PUNCTUATION_AND_SPACES);
            distinctParts.addAll(getMatchedParts(punctuationTokenization));
        }
        return new ArrayList<>(distinctParts);
    }

    /**
     * Scans the tokens from left to right, keeping for each start position the match that
     * spans the most tokens.
     *
     * @param tokenizedField the tokenized field to scan
     * @return the matched parts, in the order they were found
     */
    private List<MatchedPart> getMatchedParts(TokenizedString tokenizedField) {
        List<MatchedPart> found = new ArrayList<>();
        List<String> tokens = tokenizedField.getTokens();
        int tokenCount = tokens.size();

        int start = 0;
        while (start < tokenCount) {
            MatchedPartDict longest = findLongestMatchFrom(tokenizedField, tokens, tokenCount, start);
            if (longest == null) {
                start++;
            } else {
                found.add(longest);
                // Resume just after the match.
                // NOTE(review): assumes 'end' is the index of the last token of the match - confirm.
                start = longest.end + 1;
            }
        }
        return found;
    }

    /**
     * Grows a phrase token by token from {@code start} and returns the last valid match obtained
     * before the index stops returning results for the phrase.
     *
     * @return the match covering the most tokens, or {@code null} when no phrase starting at
     *         {@code start} yields a valid match
     */
    private MatchedPartDict findLongestMatchFrom(TokenizedString tokenizedField, List<String> tokens, int tokenCount,
            int start) {
        MatchedPartDict longest = null;
        List<String> phrase = new ArrayList<>();
        for (int end = start; end < tokenCount; end++) {
            phrase.add(StringUtils.stripAccents(tokens.get(end)));
            List<String> candidates = findMatches(phrase);
            if (candidates.isEmpty()) {
                // The index has nothing for this phrase: stop extending it.
                break;
            }
            MatchedPartDict current = computeBestMatch(tokenizedField, start, end, candidates);
            if (current != null) {
                longest = current;
            }
        }
        return longest;
    }

    /**
     * Searches the index for the phrase made of the given tokens joined with single spaces,
     * within the current semantic category.
     */
    private List<String> findMatches(List<String> phrase) {
        String joinedPhrase = StringUtils.join(phrase, ' ');
        String categoryId = this.semanticCategory.getId();
        return this.index.getSearcher().searchPhraseInSemanticCategory(categoryId, joinedPhrase);
    }

    /**
     * Picks the first dictionary entry, longest first, that is accepted for the token range
     * {@code [startToken, endToken]}.
     *
     * <p>With {@code standardize} enabled, an entry is accepted when it equals the field part
     * token by token, ignoring case and accents, and the matched part carries the dictionary
     * value. Otherwise acceptance follows the category validation mode and the matched part
     * carries the text of the field part. Entries raising an
     * {@link IllegalArgumentException} are logged at debug level and skipped.
     *
     * @param matches dictionary entries returned by the index; sorted in place
     * @return the matched part, or {@code null} when no entry is accepted
     */
    private MatchedPartDict computeBestMatch(TokenizedString tokenizedField, int startToken, int endToken,
            List<String> matches) {
        matches.sort(LONGEST_FIRST);
        for (String candidate : matches) {
            TokenizedString tokenizedCandidate = new TokenizedString(candidate);
            try {
                TokenizedString fieldPart = tokenizedField.getMatchingSubPart(tokenizedCandidate, startToken, endToken);
                if (this.standardize) {
                    if (equalsSimplified(tokenizedCandidate, fieldPart)) {
                        return new MatchedPartDict(tokenizedField, startToken, endToken, tokenizedCandidate.getValue());
                    }
                } else if (matchValid(tokenizedCandidate, fieldPart)) {
                    return new MatchedPartDict(tokenizedField, startToken, endToken, fieldPart.getValue());
                }
            } catch (IllegalArgumentException e) {
                LOGGER.debug("Match {} can't be built from field {}.", candidate, tokenizedField, e);
            }
        }
        return null;
    }

    /**
     * Checks a dictionary entry against a field part according to the validation mode of the
     * category: strict string equality for {@code EXACT}, token-wise comparison ignoring case
     * and accents for {@code SIMPLIFIED}, and whole-value comparison ignoring case and accents
     * for any other mode.
     */
    private boolean matchValid(TokenizedString match, TokenizedString fieldPart) {
        ValidationMode validationMode = this.semanticCategory.getValidationMode();
        if (validationMode == ValidationMode.SIMPLIFIED) {
            return equalsSimplified(match, fieldPart);
        }
        if (validationMode == ValidationMode.EXACT) {
            return match.getValue().equals(fieldPart.getValue());
        }
        return equalsIgnoreCaseAndAccents(match.getValue(), fieldPart.getValue());
    }

    /** Compares two strings after stripping their accents, ignoring case. */
    private boolean equalsIgnoreCaseAndAccents(String matchValue, String fieldPart) {
        String unaccentedMatch = StringUtils.stripAccents(matchValue);
        String unaccentedFieldPart = StringUtils.stripAccents(fieldPart);
        return unaccentedMatch.equalsIgnoreCase(unaccentedFieldPart);
    }

    /**
     * Compares two tokenized strings token by token, ignoring case and accents.
     *
     * @return {@code false} when either token list is {@code null}, when the token counts
     *         differ, or when any pair of tokens differs; {@code true} otherwise
     */
    private boolean equalsSimplified(TokenizedString match, TokenizedString fieldPart) {
        List<String> matchTokens = match.getTokens();
        List<String> fieldPartTokens = fieldPart.getTokens();
        if (matchTokens == null || fieldPartTokens == null) {
            return false;
        }
        int tokenCount = matchTokens.size();
        if (tokenCount != fieldPartTokens.size()) {
            return false;
        }
        int position = 0;
        while (position < tokenCount
                && equalsIgnoreCaseAndAccents(matchTokens.get(position), fieldPartTokens.get(position))) {
            position++;
        }
        return position == tokenCount;
    }
}

