/*
 * Decompiled with CFR 0.152.
 */
package org.talend.dataquality.semantic.extraction;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.talend.dataquality.semantic.extraction.ExtractFromSemanticType;
import org.talend.dataquality.semantic.extraction.MatchedPart;
import org.talend.dataquality.semantic.extraction.MatchedPartDict;
import org.talend.dataquality.semantic.extraction.TokenizedString;
import org.talend.dataquality.semantic.index.LuceneIndex;
import org.talend.dataquality.semantic.model.DQCategory;
import org.talend.dataquality.semantic.snapshot.DictionarySnapshot;

/**
 * Extracts the parts of a field that exactly match an entry of a dictionary-backed
 * semantic category, by querying the category's Lucene index with growing phrases
 * of consecutive tokens.
 */
public class ExtractFromDictionary extends ExtractFromSemanticType {

    /**
     * Separator set used for the second tokenization pass: any run of punctuation,
     * whitespace, no-break space (U+00A0), figure space (U+2007), narrow no-break
     * space (U+202F) or ideographic space (U+3000).
     */
    private static final Pattern FULL_SEPARATOR_PATTERN = Pattern.compile("[\\p{Punct}\\s\\u00A0\\u2007\\u202F\\u3000]+");

    /** Index searched for candidate dictionary entries; obtained from the inherited {@code initIndex()}. */
    private final LuceneIndex index = (LuceneIndex) this.initIndex();

    /**
     * Creates an extractor for the given category.
     *
     * @param snapshot dictionary snapshot, forwarded to the superclass
     * @param category semantic category, forwarded to the superclass
     */
    protected ExtractFromDictionary(DictionarySnapshot snapshot, DQCategory category) {
        super(snapshot, category);
    }

    /**
     * Finds the parts of {@code field} that match a dictionary entry.
     *
     * <p>A first pass runs on the tokens produced by {@link TokenizedString}. When the value
     * contains an apostrophe or a dot, a second pass runs on the value re-split with
     * {@link #FULL_SEPARATOR_PATTERN}. Results of both passes are merged through a
     * {@link LinkedHashSet}, so insertion order is kept and duplicates (as defined by
     * {@code MatchedPart}'s equality) are dropped.
     *
     * @param field raw field value to analyse
     * @return the matched parts, first-pass matches before second-pass ones
     */
    @Override
    public List<MatchedPart> getMatches(String field) {
        TokenizedString tokenizedField = new TokenizedString(field);
        LinkedHashSet<MatchedPart> uniqueMatchedParts = new LinkedHashSet<>();
        uniqueMatchedParts.addAll(this.getMatchPart(tokenizedField, tokenizedField.getTokens()));
        if (tokenizedField.getValue().contains("'") || tokenizedField.getValue().contains(".")) {
            // Work on a separate TokenizedString whose token list is replaced by the
            // re-split tokens, so the first-pass object keeps its own tokens.
            TokenizedString clone = new TokenizedString(tokenizedField.getValue());
            List<String> tokensWithoutApostrophe = this.getTokensWithoutApostropheAndDots(tokenizedField);
            clone.getTokens().clear();
            clone.getTokens().addAll(tokensWithoutApostrophe);
            uniqueMatchedParts.addAll(this.getMatchPart(clone, tokensWithoutApostrophe));
        }
        return new ArrayList<>(uniqueMatchedParts);
    }

    /**
     * Scans {@code tokens} left to right and, for each start position, keeps the longest
     * run of consecutive tokens that exactly matches a dictionary entry.
     *
     * @param tokenizedField tokenized string the resulting parts are attached to
     * @param tokens tokens to scan
     * @return one {@link MatchedPartDict} per matched run, in order of appearance
     */
    private List<MatchedPart> getMatchPart(TokenizedString tokenizedField, List<String> tokens) {
        List<MatchedPart> matchedParts = new ArrayList<>();
        int nbOfTokens = tokens.size();
        for (int i = 0; i < nbOfTokens; ++i) {
            int matchStart = -1;
            int matchEnd = -1;
            String luceneMatch = null;
            List<String> phrase = new ArrayList<>();
            for (int j = i; j < nbOfTokens; ++j) {
                String tokenWithoutAccent = StringUtils.stripAccents(tokens.get(j));
                phrase.add(tokenWithoutAccent);
                List<String> matches = this.findMatches(phrase);
                if (matches.isEmpty()) {
                    // The index returned nothing for this phrase: stop extending it.
                    break;
                }
                int match = this.exactMatchIndex(tokenizedField, phrase, matches);
                if (match <= -1) {
                    // Candidates exist but none is exact yet; a longer phrase may still match.
                    continue;
                }
                // Later (longer) exact matches overwrite earlier ones.
                luceneMatch = matches.get(match);
                matchStart = i;
                matchEnd = j;
            }
            if (luceneMatch == null) {
                continue;
            }
            matchedParts.add(new MatchedPartDict(tokenizedField, matchStart, matchEnd, luceneMatch));
            // Resume right after the matched run (the loop increment adds one).
            i = matchEnd;
        }
        return matchedParts;
    }

    /**
     * Splits the value of {@code tokenizedString} on {@link #FULL_SEPARATOR_PATTERN}.
     *
     * <p>{@link Pattern#split(CharSequence)} yields a leading empty string when the value
     * starts with a separator; that entry is removed from the returned list. The token list
     * of {@code tokenizedString} itself is left untouched, because parts already matched
     * against it refer to its tokens by index.
     *
     * @param tokenizedString source of the value to split
     * @return a new mutable list of tokens without a leading empty entry
     */
    private List<String> getTokensWithoutApostropheAndDots(TokenizedString tokenizedString) {
        List<String> tokensWithoutApostrophe =
                new ArrayList<>(Arrays.asList(FULL_SEPARATOR_PATTERN.split(tokenizedString.getValue())));
        if (!tokensWithoutApostrophe.isEmpty() && tokensWithoutApostrophe.get(0).isEmpty()) {
            tokensWithoutApostrophe.remove(0);
        }
        return tokensWithoutApostrophe;
    }

    /**
     * Searches the index for the phrase, joined with single spaces, within this
     * extractor's semantic category.
     *
     * @param phrase accent-stripped tokens forming the phrase
     * @return the candidates returned by the searcher
     */
    private List<String> findMatches(List<String> phrase) {
        return this.index.getSearcher().searchPhraseInSemanticCategory(this.semanticCategory.getId(), StringUtils.join(phrase, ' '));
    }

    /**
     * Looks for a candidate whose tokens equal {@code phrase}, ignoring case and accents.
     *
     * <p>{@code matches} is sorted in place, longest string first; the returned index
     * refers to that sorted order, and callers read the entry back from the same list.
     *
     * @param tokenizedField used to tokenize each candidate
     * @param phrase accent-stripped tokens to compare against
     * @param matches candidates returned by the index; reordered by this call
     * @return index of the first exact candidate in the sorted list, or {@code -1}
     */
    private int exactMatchIndex(TokenizedString tokenizedField, List<String> phrase, List<String> matches) {
        Collections.sort(matches, Comparator.comparingInt(String::length).reversed());
        for (int i = 0; i < matches.size(); ++i) {
            List<String> matchTokens = tokenizedField.tokenize(StringUtils.stripAccents(matches.get(i)));
            if (this.equalsIgnoreCase(matchTokens, phrase)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Compares two token lists position by position, ignoring case.
     *
     * <p>As a tolerance, when only the last phrase word differs and it ends with a dot,
     * the comparison is retried on that word without its trailing dot.
     *
     * @param tokens tokens of a dictionary candidate
     * @param phrase tokens taken from the field
     * @return {@code true} when both lists are non-null, of equal size and equal element-wise
     */
    private boolean equalsIgnoreCase(List<String> tokens, List<String> phrase) {
        if (tokens == null || phrase == null) {
            return false;
        }
        if (tokens.size() != phrase.size()) {
            return false;
        }
        for (int i = 0; i < tokens.size(); ++i) {
            String word = phrase.get(i);
            if (tokens.get(i).equalsIgnoreCase(word)) {
                continue;
            }
            if (i == tokens.size() - 1 && word.endsWith(".")) {
                word = word.substring(0, word.length() - 1);
                return tokens.get(i).equalsIgnoreCase(word);
            }
            return false;
        }
        return true;
    }
}

